In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [3]:
import gensim

In [4]:
df = pd.read_csv('dataset/train_file.csv')


In [5]:
df.head(2)

Unnamed: 0,IDLink,Title,Headline,Source,Topic,PublishDate,Facebook,GooglePlus,LinkedIn,SentimentTitle,SentimentHeadline
0,Tr3CMgRv1N,Obama Lays Wreath at Arlington National Cemetery,Obama Lays Wreath at Arlington National Cemete...,USA TODAY,obama,2002-04-02 00:00:00,-1,-1,-1,0.0,-0.0533
1,Wc81vGp8qZ,A Look at the Health of the Chinese Economy,"Tim Haywood, investment director business-unit...",Bloomberg,economy,2008-09-20 00:00:00,-1,-1,-1,0.208333,-0.156386


In [6]:
df['Source'].nunique()

4753

In [7]:
df.dropna(subset=['IDLink'],inplace=True)

In [8]:
df.count()

IDLink               55932
Title                55932
Headline             55932
Source               55757
Topic                55932
PublishDate          55932
Facebook             55932
GooglePlus           55932
LinkedIn             55932
SentimentTitle       55932
SentimentHeadline    55932
dtype: int64

## preprocessing data

In [9]:
df1 = df

In [10]:
import string
from gensim.parsing.preprocessing import remove_stopwords

def remove_punctuation(input_str):
    punctuation_free = "".join(i for i in input_str if i not in string.punctuation)
    return punctuation_free


In [11]:
from nltk.stem import WordNetLemmatizer
#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()
#defining the function for lemmatization
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text


In [12]:
import nltk

In [13]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/b0206395/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
def pre_process_data(input_str):
    input_str = remove_punctuation(input_str)
    input_str = remove_stopwords(input_str)
    input_str = gensim.utils.simple_preprocess(input_str)
    input_str = lemmatizer(input_str)
    return input_str

def feature_importance(df,model):
    columns = df.columns

    model.feature_importances_

    return dict(zip(columns,model.feature_importances_))

In [None]:
# df1['all_text'] = df1['Title']+df1['Headline']

# df1['all_text_processed'] = df1['all_text'].apply(lambda x: pre_process_data(x))

# df1.head(2)

# build model

In [None]:
df1['title_text'] = df1['Title'].apply(lambda x: pre_process_data(x))
df1['headline_text'] = df1['Headline'].apply(lambda x: pre_process_data(x))

In [None]:
template_text1 = df1.title_text
template_text2 = df1.headline_text

template_text = template_text1.append(template_text2)

In [None]:
len(template_text)

In [None]:
template_text

In [None]:
df1.shape

In [None]:
from gensim.models import FastText

model = FastText(window=5, 
                min_count=5,
                workers=4,
                )

# build vocab first
model.build_vocab(template_text, progress_per=100)

model.epochs

model.train(template_text,total_examples=model.corpus_count,epochs=model.epochs)

model.save("word2_vec_fast_text.model")

In [None]:
model.wv.most_similar("obama")

In [None]:
model.wv.most_similar('govt')

In [None]:
model.wv.get_vector('member')

# Load model

In [15]:
from gensim.models import Word2Vec

model = Word2Vec.load("word2_vec_fast_text.model")

In [16]:
def sentence_embedding(sentence,model=model):
    word_list = pre_process_data(sentence)
    word_list = [each for each in word_list if each in model.wv.key_to_index]
    sentence_vec = np.zeros((100,),dtype='float32')
    sentence_len= len(word_list)
    for each in word_list:
        each_vector = model.wv.get_vector(each)
        sentence_vec = np.add(sentence_vec,each_vector)
    if sentence_len>0:
        sentence_vec = sentence_vec/sentence_len
    return sentence_vec

In [17]:
def gen_similarity_score(sent1,sent2):
    sent1_embedding = sentence_embedding(sent1)
    sent2_embedding = sentence_embedding(sent2)
    return 1- spatial.distance.cosine(sent1_embedding,sent2_embedding)

## creating text embedding for title and headline text

In [18]:
df1['title_embedding']  = df1.apply(lambda x: sentence_embedding(x['Title']),axis=1)

df1['headline_embedding']  = df1.apply(lambda x: sentence_embedding(x['Headline']),axis=1)

## categorical encoding

In [19]:
df1 = pd.get_dummies(data=df1,columns=['Topic'])

In [20]:
df1.columns

Index(['IDLink', 'Title', 'Headline', 'Source', 'PublishDate', 'Facebook',
       'GooglePlus', 'LinkedIn', 'SentimentTitle', 'SentimentHeadline',
       'title_embedding', 'headline_embedding', 'Topic_economy',
       'Topic_microsoft', 'Topic_obama', 'Topic_palestine'],
      dtype='object')

In [21]:
# TODO: use publish date to identify age of news in days/week/months.

In [22]:
title_columns = ['t'+str(i) for i in range(1,101)]
headline_columns = ['h'+str(i) for i in range(1,101)]

In [23]:
title_dataframe = pd.DataFrame(df1['title_embedding'].to_list(), columns = title_columns)
headline_dataframe = pd.DataFrame(df1['headline_embedding'].to_list(), columns = headline_columns)

In [24]:
title_dataframe.head(1)

Unnamed: 0,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,...,t91,t92,t93,t94,t95,t96,t97,t98,t99,t100
0,-0.259122,0.743215,-0.608792,0.250443,0.175426,-0.074218,0.7744,0.128739,1.030575,-1.091983,...,-0.452373,-0.051135,0.230236,0.180493,-0.088765,0.78054,0.476393,-0.464559,0.16577,0.35524


In [25]:
headline_dataframe.head(1)

Unnamed: 0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,...,h91,h92,h93,h94,h95,h96,h97,h98,h99,h100
0,-0.071621,0.583369,-0.637146,0.084336,0.115242,0.055463,0.515422,-0.00683,0.851999,-0.84087,...,-0.565439,-0.141013,-0.133866,0.20046,-0.45326,1.006475,0.572488,-0.646297,0.4838,0.750966


In [26]:
df_sentiment_title = pd.concat([df1[['IDLink','Facebook','GooglePlus',
                                     'LinkedIn','Topic_economy', 'Topic_microsoft', 
                                     'Topic_obama','Topic_palestine','SentimentTitle']], title_dataframe], axis=1)

In [27]:
df_sentiment_headline = pd.concat([df1[['IDLink','Facebook','GooglePlus',
                                        'LinkedIn','Topic_economy', 'Topic_microsoft',
                                        'Topic_obama','Topic_palestine',
                                        'SentimentHeadline']], headline_dataframe], axis=1)

In [28]:
df_sentiment_title.columns

Index(['IDLink', 'Facebook', 'GooglePlus', 'LinkedIn', 'Topic_economy',
       'Topic_microsoft', 'Topic_obama', 'Topic_palestine', 'SentimentTitle',
       't1',
       ...
       't91', 't92', 't93', 't94', 't95', 't96', 't97', 't98', 't99', 't100'],
      dtype='object', length=109)

In [29]:
df_sentiment_headline.columns

Index(['IDLink', 'Facebook', 'GooglePlus', 'LinkedIn', 'Topic_economy',
       'Topic_microsoft', 'Topic_obama', 'Topic_palestine',
       'SentimentHeadline', 'h1',
       ...
       'h91', 'h92', 'h93', 'h94', 'h95', 'h96', 'h97', 'h98', 'h99', 'h100'],
      dtype='object', length=109)


# Regression model building

### Sentiment Title

In [79]:
id_var = ['IDLink','SentimentTitle','SentimentHeadline']

In [80]:
model_title_columns = list(set(list(df_sentiment_title.columns)) - set(id_var))

In [81]:
columns_to_scale = model_title_columns

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

df_sentiment_title[columns_to_scale] = scaler.fit_transform(df_sentiment_title[columns_to_scale])

In [52]:
Y = df_sentiment_title['SentimentTitle']
X = df_sentiment_title.copy()
X.drop('SentimentTitle',inplace=True,axis=1)
X.drop('IDLink',inplace=True,axis=1)

In [53]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2,random_state=23)

In [54]:
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
# from sklearn.metrics import roc_auc_score,roc_curve
# from sklearn.metrics import confusion_matrix, classification_report
# import matplotlib.pyplot as pyplot

In [55]:
lr = LinearRegression()
lr.fit(X_train,Y_train)

LinearRegression()

In [56]:
lr.score(X_train,Y_train)

0.12245163119746827

In [66]:
rf = RandomForestRegressor(n_estimators=100,criterion='mse',max_features='sqrt',verbose=2)

In [67]:
rf.fit(X_train,Y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   54.5s finished


RandomForestRegressor(max_features='sqrt', verbose=2)

In [68]:
y_test_predict = rf.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.8s finished


In [69]:
rf.score(X_test,Y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.4s finished


0.3119681053163076

### Sentiment headline

In [100]:
id_var = ['IDLink','SentimentHeadline','SentimentTitle']

In [101]:
model_headline_columns = list(set(list(df_sentiment_headline.columns)) - set(id_var))

In [102]:
columns_to_scale = model_headline_columns

In [103]:
df_sentiment_headline[columns_to_scale] = scaler.fit_transform(df_sentiment_headline[columns_to_scale])

In [104]:
Y_h = df_sentiment_headline['SentimentHeadline']
X_h = df_sentiment_headline.copy()
X_h.drop('SentimentHeadline',inplace=True,axis=1)
X_h.drop('IDLink',inplace=True,axis=1)

In [105]:
from sklearn.model_selection import train_test_split
Xh_train, Xh_test, Yh_train, Yh_test = train_test_split(X_h,Y_h,test_size=0.2,random_state=23)

In [106]:
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
# from sklearn.metrics import roc_auc_score,roc_curve
# from sklearn.metrics import confusion_matrix, classification_report
# import matplotlib.pyplot as pyplot

rf_headline = RandomForestRegressor(n_estimators=100,criterion='mse',max_features='sqrt',verbose=2)

In [107]:
rf_headline.fit(Xh_train,Yh_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   53.3s finished


RandomForestRegressor(max_features='sqrt', verbose=2)

In [108]:
y_test_predict = rf_headline.predict(Xh_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.6s finished


In [109]:
rf_headline.score(Xh_test,Yh_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.4s finished


0.20568917446203483

## MAE error in train test data

## Test Data

In [89]:
title_columns = ['t'+str(i) for i in range(1,101)]
headline_columns = ['h'+str(i) for i in range(1,101)]

In [85]:
test_df = pd.read_csv('dataset/test_file.csv')

In [86]:
test_df = pd.get_dummies(data=test_df,columns=['Topic'])

In [90]:
test_df['title_embedding']  = test_df.apply(lambda x: sentence_embedding(x['Title']),axis=1)

test_df['headline_embedding']  = test_df.apply(lambda x: sentence_embedding(x['Headline']),axis=1)

In [91]:
title_dataframe_test = pd.DataFrame(test_df['title_embedding'].to_list(), columns = title_columns)
headline_dataframe_test = pd.DataFrame(test_df['headline_embedding'].to_list(), columns = headline_columns)

title_dataframe_test.head(1)

headline_dataframe_test.head(1)

df_sentiment_title_test = pd.concat([test_df[['IDLink','Facebook','GooglePlus',
                                     'LinkedIn','Topic_economy', 'Topic_microsoft', 
                                     'Topic_obama','Topic_palestine']], title_dataframe_test], axis=1)

df_sentiment_headline_test = pd.concat([test_df[['IDLink','Facebook','GooglePlus',
                                        'LinkedIn','Topic_economy', 'Topic_microsoft',
                                        'Topic_obama','Topic_palestine',
                                        ]], headline_dataframe_test], axis=1)

In [92]:
df_sentiment_title_test.head(2)

Unnamed: 0,IDLink,Facebook,GooglePlus,LinkedIn,Topic_economy,Topic_microsoft,Topic_obama,Topic_palestine,t1,t2,...,t91,t92,t93,t94,t95,t96,t97,t98,t99,t100
0,tFrqIR6Chj,0,0,1,1,0,0,0,0.05377,0.374439,...,-0.142073,-0.053525,-0.384578,-0.427172,-0.118124,0.646082,-0.406689,-0.159635,-0.766303,-0.409431
1,DVAaGErjlF,121,2,13,0,1,0,0,0.002932,-0.159017,...,-0.148194,-0.334973,-0.769728,0.217712,-0.030892,-0.663118,0.203932,-0.072182,0.404501,0.349951


In [93]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [94]:
model_title_columns

['t8',
 't49',
 't36',
 't44',
 't30',
 't98',
 't31',
 't65',
 't12',
 't95',
 'GooglePlus',
 't29',
 't34',
 't57',
 't64',
 't77',
 't27',
 't37',
 't5',
 't3',
 't94',
 't61',
 'Topic_obama',
 't17',
 't23',
 't25',
 't18',
 't90',
 't41',
 't20',
 'Topic_palestine',
 't11',
 't93',
 't24',
 't13',
 't84',
 't86',
 't99',
 't39',
 't4',
 't66',
 't38',
 't22',
 't6',
 't52',
 't51',
 't43',
 't47',
 't35',
 't54',
 't62',
 't92',
 't45',
 't79',
 't9',
 't14',
 't89',
 't2',
 't16',
 't63',
 't60',
 't71',
 't69',
 't80',
 't58',
 't15',
 't26',
 't73',
 't40',
 't50',
 't78',
 't97',
 't55',
 't33',
 't100',
 't59',
 't48',
 't76',
 't85',
 't53',
 't28',
 't96',
 't1',
 't21',
 't42',
 't83',
 't87',
 't91',
 't72',
 't67',
 't56',
 't70',
 'Topic_microsoft',
 't32',
 't7',
 't68',
 't46',
 'Topic_economy',
 'Facebook',
 'LinkedIn',
 't75',
 't88',
 't74',
 't19',
 't10',
 't81',
 't82']

In [95]:
df_sentiment_title_test[model_title_columns] = scaler.fit_transform(df_sentiment_title_test[model_title_columns])

In [96]:
df_sentiment_headline_test[model_headline_columns] = scaler.fit_transform(df_sentiment_headline_test[model_headline_columns])

In [97]:
df_sentiment_headline_test.head(2)

Unnamed: 0,IDLink,Facebook,GooglePlus,LinkedIn,Topic_economy,Topic_microsoft,Topic_obama,Topic_palestine,h1,h2,...,h91,h92,h93,h94,h95,h96,h97,h98,h99,h100
0,tFrqIR6Chj,6e-05,0.000983,9.8e-05,1.0,0.0,0.0,0.0,0.513054,0.472679,...,0.657906,0.553308,0.761493,0.483861,0.540019,0.735562,0.493735,0.487695,0.624143,0.249772
1,DVAaGErjlF,0.00735,0.00295,0.000688,0.0,1.0,0.0,0.0,0.554669,0.359753,...,0.714427,0.447673,0.494234,0.402271,0.508263,0.518869,0.557427,0.529298,0.721145,0.291542


In [99]:
rf.predict(df_sentiment_title_test[model_title_columns])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.4s finished


array([-0.05667924, -0.01183426, -0.03487739, ..., -0.00413778,
       -0.01062017, -0.01150855])

In [110]:
rf_headline.predict(df_sentiment_headline_test[model_headline_columns])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.4s finished


array([-0.01778248, -0.05792901, -0.07004754, ..., -0.01026592,
       -0.04722526, -0.04763362])