In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [3]:
import gensim

In [4]:
df = pd.read_csv('dataset/train_file.csv')


In [5]:
df.head(2)

Unnamed: 0,IDLink,Title,Headline,Source,Topic,PublishDate,Facebook,GooglePlus,LinkedIn,SentimentTitle,SentimentHeadline
0,Tr3CMgRv1N,Obama Lays Wreath at Arlington National Cemetery,Obama Lays Wreath at Arlington National Cemete...,USA TODAY,obama,2002-04-02 00:00:00,-1,-1,-1,0.0,-0.0533
1,Wc81vGp8qZ,A Look at the Health of the Chinese Economy,"Tim Haywood, investment director business-unit...",Bloomberg,economy,2008-09-20 00:00:00,-1,-1,-1,0.208333,-0.156386


In [6]:
df['Source'].nunique()

4753

In [7]:
df.dropna(subset=['IDLink'],inplace=True)

In [8]:
df.count()

IDLink               55932
Title                55932
Headline             55932
Source               55757
Topic                55932
PublishDate          55932
Facebook             55932
GooglePlus           55932
LinkedIn             55932
SentimentTitle       55932
SentimentHeadline    55932
dtype: int64

## preprocessing data

In [9]:
df1 = df

In [10]:
import string
from gensim.parsing.preprocessing import remove_stopwords

def remove_punctuation(input_str):
    punctuation_free = "".join(i for i in input_str if i not in string.punctuation)
    return punctuation_free


In [11]:
from nltk.stem import WordNetLemmatizer
#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()
#defining the function for lemmatization
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text


In [12]:
import nltk

# nltk.download('wordnet')

In [13]:
def pre_process_data(input_str):
    input_str = remove_punctuation(input_str)
    input_str = remove_stopwords(input_str)
    input_str = gensim.utils.simple_preprocess(input_str)
    input_str = lemmatizer(input_str)
    return input_str

def feature_importance(df,model):
    columns = df.columns

    model.feature_importances_

    return dict(zip(columns,model.feature_importances_))

In [14]:
# df1['all_text'] = df1['Title']+df1['Headline']

# df1['all_text_processed'] = df1['all_text'].apply(lambda x: pre_process_data(x))

# df1.head(2)

# build model

In [17]:
df1['title_text'] = df1['Title'].apply(lambda x: pre_process_data(x))
df1['headline_text'] = df1['Headline'].apply(lambda x: pre_process_data(x))

In [18]:
template_text1 = df1.title_text
template_text2 = df1.headline_text

template_text = template_text1.append(template_text2)

In [19]:
len(template_text)

111864

In [20]:
template_text

0        [obama, lay, wreath, arlington, national, ceme...
1                         [look, health, chinese, economy]
2           [nouriel, roubini, global, economy, not, back]
3                              [finland, gdp, expands, in]
4        [tourism, govt, spending, buoy, thai, economy,...
                               ...                        
55927    [retired, cuban, leader, fidel, castro, slamme...
55928    [president, obama, caught, predictable, flak, ...
55929    [while, trump, want, large, tariff, import, ma...
55930    [microsoft, business, customer, finally, begin...
55931    [a, we, know, listening, campaign, rhetoric, c...
Length: 111864, dtype: object

In [21]:
df1.shape

(55932, 13)

In [22]:
from gensim.models import FastText

model = FastText(window=5, 
                min_count=5,
                workers=4,
                )

# build vocab first
model.build_vocab(template_text, progress_per=100)

model.epochs

model.train(template_text,total_examples=model.corpus_count,epochs=model.epochs)

model.save("word2_vec_fast_text.model")

In [23]:
model.wv.most_similar("obama")

[('exobama', 0.9890816807746887),
 ('obamaera', 0.977846622467041),
 ('boehnerobama', 0.9734190106391907),
 ('obr', 0.9707019925117493),
 ('postobama', 0.9538817405700684),
 ('antiobama', 0.9435935616493225),
 ('obamaquot', 0.9387995004653931),
 ('obamas', 0.9366296529769897),
 ('obamaseinfeld', 0.9065695405006409),
 ('obamacare', 0.8995159268379211)]

In [24]:
model.wv.most_similar('govt')

[('govts', 0.9193936586380005),
 ('governor', 0.8263310194015503),
 ('gov', 0.8227432370185852),
 ('vt', 0.7358570098876953),
 ('governance', 0.7076809406280518),
 ('morneau', 0.6870869398117065),
 ('government', 0.6830440163612366),
 ('fda', 0.6674199104309082),
 ('grgoiretrudeau', 0.6671798825263977),
 ('ppf', 0.6641440987586975)]

In [25]:
model.wv.get_vector('member')

array([-0.3398502 , -1.0200082 , -0.17734738,  0.6572385 ,  1.7991658 ,
        0.27955616,  0.58919024, -0.56918395,  1.2916338 , -2.346885  ,
        0.34812832, -0.40717015, -0.39574534,  0.49153358, -0.96604234,
       -0.36105618, -0.8926641 ,  1.0654751 ,  0.89797854,  0.24014531,
        0.1483076 ,  2.1264896 ,  0.0611372 ,  0.13122332,  0.9164747 ,
        0.47831082, -0.49389368,  0.25216138,  0.26483718,  0.82297087,
       -0.27040553, -0.51653945,  0.20785622, -1.7289991 ,  1.3457798 ,
        1.0604328 , -1.4932153 ,  0.16039884,  0.164087  ,  0.5405259 ,
        1.2203163 , -0.02340875,  0.2579502 , -1.0355451 , -0.84879214,
       -1.4256722 ,  1.2292757 , -0.04444612, -0.16129069, -0.36063337,
        0.37636867, -0.37528652,  0.28977218, -0.45630655,  1.1168013 ,
        0.7785162 , -0.9058634 ,  0.9831589 ,  0.03112453, -0.25844333,
       -0.11396223,  0.07335816, -1.0243016 , -0.09062236, -0.10551562,
       -0.03693302,  0.5811043 , -0.60966974,  0.3658824 , -0.77

# Load model

In [15]:
from gensim.models import Word2Vec

model = Word2Vec.load("word2_vec_fast_text.model")

In [16]:
def sentence_embedding(sentence,model=model):
    word_list = pre_process_data(sentence)
    word_list = [each for each in word_list if each in model.wv.key_to_index]
    sentence_vec = np.zeros((100,),dtype='float32')
    sentence_len= len(word_list)
    for each in word_list:
        each_vector = model.wv.get_vector(each)
        sentence_vec = np.add(sentence_vec,each_vector)
    if sentence_len>0:
        sentence_vec = sentence_vec/sentence_len
    return sentence_vec

In [17]:
def gen_similarity_score(sent1,sent2):
    sent1_embedding = sentence_embedding(sent1)
    sent2_embedding = sentence_embedding(sent2)
    return 1- spatial.distance.cosine(sent1_embedding,sent2_embedding)

## creating text embedding for title and headline text

In [18]:
df1 = pd.get_dummies(data=df1,columns=['Topic'])

In [19]:
df1['title_embedding']  = df1.apply(lambda x: sentence_embedding(x['Title']),axis=1)

df1['headline_embedding']  = df1.apply(lambda x: sentence_embedding(x['Headline']),axis=1)

In [20]:
title_columns = ['t'+str(i) for i in range(1,101)]
headline_columns = ['h'+str(i) for i in range(1,101)]

title_dataframe = pd.DataFrame(df1['title_embedding'].to_list(), columns = title_columns)
headline_dataframe = pd.DataFrame(df1['headline_embedding'].to_list(), columns = headline_columns)

In [21]:
df2 = pd.concat([df1,title_dataframe,headline_dataframe],axis=1)

In [22]:
df2.head()

Unnamed: 0,IDLink,Title,Headline,Source,PublishDate,Facebook,GooglePlus,LinkedIn,SentimentTitle,SentimentHeadline,...,h91,h92,h93,h94,h95,h96,h97,h98,h99,h100
0,Tr3CMgRv1N,Obama Lays Wreath at Arlington National Cemetery,Obama Lays Wreath at Arlington National Cemete...,USA TODAY,2002-04-02 00:00:00,-1,-1,-1,0.0,-0.0533,...,-0.543032,-0.254333,-0.232355,0.131315,-0.09142,0.867266,0.276129,-0.160378,0.251298,0.509076
1,Wc81vGp8qZ,A Look at the Health of the Chinese Economy,"Tim Haywood, investment director business-unit...",Bloomberg,2008-09-20 00:00:00,-1,-1,-1,0.208333,-0.156386,...,-0.463229,-0.552598,-0.715039,-0.543524,-0.573413,0.441098,-0.133821,0.311327,-0.492976,0.150883
2,zNGH03CrZH,Nouriel Roubini: Global Economy Not Back to 2008,"Nouriel Roubini, NYU professor and chairman at...",Bloomberg,2012-01-28 00:00:00,-1,-1,-1,-0.42521,0.139754,...,-0.193322,-0.091477,-0.482368,-0.547905,-0.584852,0.094878,-0.203264,-0.013735,-0.710345,0.172799
3,3sM1H0W8ts,Finland GDP Expands In Q4,Finland's economy expanded marginally in the t...,RTT News,2015-03-01 00:06:00,-1,-1,-1,0.0,0.026064,...,-0.260081,-0.378379,0.136987,-0.115915,0.4568,0.53322,-0.063789,-0.045482,-0.551651,0.003335
4,wUbnxgvqaZ,"Tourism, govt spending buoys Thai economy in J...",Tourism and public spending continued to boost...,The Nation - Thailand&#39;s English news,2015-03-01 00:11:00,-1,-1,-1,0.0,0.141084,...,-0.181461,-0.382395,-0.453737,-0.276794,0.013191,0.278046,-0.331133,0.094336,-0.628746,0.057609


In [23]:
df2.columns

Index(['IDLink', 'Title', 'Headline', 'Source', 'PublishDate', 'Facebook',
       'GooglePlus', 'LinkedIn', 'SentimentTitle', 'SentimentHeadline',
       ...
       'h91', 'h92', 'h93', 'h94', 'h95', 'h96', 'h97', 'h98', 'h99', 'h100'],
      dtype='object', length=216)

## Train test split

In [26]:
Y = df2[['SentimentTitle','SentimentHeadline']]
X = df2.copy()

In [27]:
X.drop('SentimentTitle',inplace=True,axis=1)
X.drop('SentimentHeadline',inplace=True,axis=1)
X.drop('Title',inplace=True,axis=1)
X.drop('Headline',inplace=True,axis=1)
X.drop('Source',inplace=True,axis=1)
X.drop('PublishDate',inplace=True,axis=1)
X.drop('title_embedding',inplace=True,axis=1)
X.drop('headline_embedding',inplace=True,axis=1)
X.drop('IDLink',inplace=True,axis=1)

try:
    X.drop('title_text',inplace=True,axis=1)
    X.drop('headline_text',inplace=True,axis=1)
except Exception as e:
    pass

In [28]:
X.head()

Unnamed: 0,Facebook,GooglePlus,LinkedIn,Topic_economy,Topic_microsoft,Topic_obama,Topic_palestine,t1,t2,t3,...,h91,h92,h93,h94,h95,h96,h97,h98,h99,h100
0,-1,-1,-1,0,0,1,0,-0.098636,0.633668,-0.69392,...,-0.543032,-0.254333,-0.232355,0.131315,-0.09142,0.867266,0.276129,-0.160378,0.251298,0.509076
1,-1,-1,-1,1,0,0,0,-0.21868,0.177207,0.278337,...,-0.463229,-0.552598,-0.715039,-0.543524,-0.573413,0.441098,-0.133821,0.311327,-0.492976,0.150883
2,-1,-1,-1,1,0,0,0,0.187596,0.395501,-0.322583,...,-0.193322,-0.091477,-0.482368,-0.547905,-0.584852,0.094878,-0.203264,-0.013735,-0.710345,0.172799
3,-1,-1,-1,1,0,0,0,-0.530583,0.187374,0.015022,...,-0.260081,-0.378379,0.136987,-0.115915,0.4568,0.53322,-0.063789,-0.045482,-0.551651,0.003335
4,-1,-1,-1,1,0,0,0,-0.154024,0.447604,-0.103433,...,-0.181461,-0.382395,-0.453737,-0.276794,0.013191,0.278046,-0.331133,0.094336,-0.628746,0.057609


In [29]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [30]:
X[list(X.columns)] = scaler.fit_transform(X[list(X.columns)])

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2,random_state=23)

In [32]:
main_columns = ['Facebook','GooglePlus',
                'LinkedIn','Topic_economy', 'Topic_microsoft', 
                'Topic_obama','Topic_palestine']


### Sentiment Title

In [33]:
select_title_columns = main_columns+title_columns
select_headline_columns = main_columns+headline_columns



In [34]:
X_train_title = X_train[select_title_columns]
X_test_title = X_test[select_title_columns]

Y_train_title = Y_train['SentimentTitle']
Y_test_title = Y_test['SentimentTitle']

In [35]:
X_train_headline = X_train[select_headline_columns]
X_test_headline = X_test[select_headline_columns]
Y_train_headline = Y_train['SentimentHeadline']
Y_test_headline = Y_test['SentimentHeadline']

In [36]:
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
# from sklearn.metrics import roc_auc_score,roc_curve
# from sklearn.metrics import confusion_matrix, classification_report
# import matplotlib.pyplot as pyplot

In [37]:
lr = LinearRegression()
lr.fit(X_train_title,Y_train_title)

lr.score(X_train_title,Y_train_title)

0.12545072871832552

In [38]:
lr = LinearRegression()
lr.fit(X_train_headline,Y_train_headline)

lr.score(X_train_headline,Y_train_headline)

0.11062393874112009

In [40]:
rf_title = RandomForestRegressor(n_estimators=200,criterion='mse',max_features='sqrt',verbose=1)

rf_title.fit(X_train_title,Y_train_title)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:  2.1min finished


RandomForestRegressor(max_features='sqrt', n_estimators=200, verbose=1)

In [41]:
rf_headline = RandomForestRegressor(n_estimators=200,criterion='mse',max_features='sqrt',verbose=1)

rf_headline.fit(X_train_headline,Y_train_headline)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:  1.9min finished


RandomForestRegressor(max_features='sqrt', n_estimators=200, verbose=1)

In [42]:
y_test_predict_title = rf_title.predict(X_test_title)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.9s finished


In [43]:
rf_title.score(X_test_title,Y_test_title)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.8s finished


0.3156266441286155

In [44]:
rf_headline.score(X_test_headline,Y_test_headline)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.9s finished


0.21270182809263072

## MAE error in train test data

In [45]:
y_test_predict_title = rf_title.predict(X_test_title)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.8s finished


In [46]:
y_test_predict_headline = rf_headline.predict(X_test_headline)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.8s finished


In [47]:
import math

In [48]:
np.mean(abs(Y_test_title-y_test_predict_title))

0.07670906559849482

In [49]:
np.mean(abs(Y_test_headline-y_test_predict_headline))

0.09387661345913298

In [50]:
max(0,1-(0.4*np.mean(abs(Y_test_title-y_test_predict_title))+0.6*np.mean(abs(Y_test_headline-y_test_predict_headline))))

0.9129904056851222

## Test Data

In [68]:
title_columns = ['t'+str(i) for i in range(1,101)]
headline_columns = ['h'+str(i) for i in range(1,101)]

In [69]:
test_df = pd.read_csv('dataset/test_file.csv')

In [70]:
test_df = pd.get_dummies(data=test_df,columns=['Topic'])

In [71]:
test_df['title_embedding']  = test_df.apply(lambda x: sentence_embedding(x['Title']),axis=1)

test_df['headline_embedding']  = test_df.apply(lambda x: sentence_embedding(x['Headline']),axis=1)

In [72]:
title_dataframe_test = pd.DataFrame(test_df['title_embedding'].to_list(), columns = title_columns)
headline_dataframe_test = pd.DataFrame(test_df['headline_embedding'].to_list(), columns = headline_columns)

In [73]:
test_df1 = pd.concat([test_df,title_dataframe_test,headline_dataframe_test],axis=1)

In [74]:
test_df1_title = test_df1[select_title_columns]
test_df1_headline = test_df1[select_headline_columns]

In [75]:
test_df1_title.head(2)

Unnamed: 0,Facebook,GooglePlus,LinkedIn,Topic_economy,Topic_microsoft,Topic_obama,Topic_palestine,t1,t2,t3,...,t91,t92,t93,t94,t95,t96,t97,t98,t99,t100
0,0,0,1,1,0,0,0,-0.170861,0.585554,-0.347407,...,-0.367294,-0.289588,-0.365901,-0.303176,-0.062653,0.616139,-0.178627,-0.131225,-0.563069,-0.443287
1,121,2,13,0,1,0,0,0.140719,-0.166838,0.648721,...,-0.425418,-0.816827,-0.902377,0.103103,-0.000711,-0.611387,0.622406,-0.315822,0.543858,0.718103


In [76]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [77]:
test_df1_title[select_title_columns] = scaler.fit_transform(test_df1_title[select_title_columns])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df1_title[select_title_columns] = scaler.fit_transform(test_df1_title[select_title_columns])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())


In [78]:
test_df1_headline[select_headline_columns] = scaler.fit_transform(test_df1_headline[select_headline_columns])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df1_headline[select_headline_columns] = scaler.fit_transform(test_df1_headline[select_headline_columns])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())


In [79]:
test_df1_headline.head(2)

Unnamed: 0,Facebook,GooglePlus,LinkedIn,Topic_economy,Topic_microsoft,Topic_obama,Topic_palestine,h1,h2,h3,...,h91,h92,h93,h94,h95,h96,h97,h98,h99,h100
0,6e-05,0.000983,9.8e-05,1.0,0.0,0.0,0.0,0.459887,0.331097,0.487324,...,0.57267,0.434023,0.700565,0.669624,0.573498,0.76733,0.587888,0.7124,0.519887,0.250279
1,0.00735,0.00295,0.000688,0.0,1.0,0.0,0.0,0.564693,0.281043,0.656578,...,0.653173,0.280203,0.406907,0.490932,0.420343,0.496176,0.746641,0.609433,0.692935,0.419768


In [80]:
rf_title.predict(test_df1_title)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.7s finished


array([-0.07091009, -0.03334267, -0.05220521, ..., -0.02933717,
       -0.04341188, -0.00927376])

In [81]:
rf_headline.predict(test_df1_headline)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.7s finished


array([-0.01613106, -0.00335052, -0.01490469, ..., -0.03522935,
       -0.03817481, -0.0331324 ])

In [82]:
test_df1['SentimentTitle'] = rf_title.predict(test_df1_title)

test_df1['SentimentHeadline'] = rf_headline.predict(test_df1_headline)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.7s finished


In [83]:
final_df = test_df1[['IDLink','SentimentTitle','SentimentHeadline']]

In [84]:
final_df.head()

Unnamed: 0,IDLink,SentimentTitle,SentimentHeadline
0,tFrqIR6Chj,-0.07091,-0.016131
1,DVAaGErjlF,-0.033343,-0.003351
2,OT9UIZm5M2,-0.052205,-0.014905
3,lflGp3q2Fj,-0.001523,-0.049912
4,zDYG0SoovZ,-0.015058,-0.019616


In [85]:
final_df.to_csv('sentiment_analysis_submission.csv',index=False)