In [36]:
import numpy as np
import pandas as pd
from collections import OrderedDict
import re
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [2]:
corpus = ['Apple Orange Orange Apple','Apple Banana Apple Banana','Banana Apple Banana Banana Banana Apple',\
          'Banana Orange Banana Banana Orange Banana','Banana Apple Banana Banana Orange Banana']

In [3]:
''' Create a function to vectorize the counts of a list of sentences. Does not ignore case.'''
def myCountVec(corpus):
    bag = set()  # use a set to figure out how many unique words there are
    word_index = dict() # use a dict to store the column index of the word
    for line in corpus:
        for word in line.split():
            if word in bag:
                continue
            else:
                bag.add(word)
    list_of_words = sorted(list(bag)) # sort the list of words derived from set
    for index,word in enumerate(list_of_words):
        word_index[word] = index  # assign index to word
    vec_of_counts = np.zeros((len(corpus),len(word_index))) # create dummy matrix with all locations initilised to 0
    for row,line in enumerate(corpus):
        for word in line.split():
            col = word_index[word]
            vec_of_counts[row][col] += 1  # populate the matrix, increasing a word count when needed.
    return vec_of_counts

In [4]:
myCountVec(corpus)

array([[2., 0., 2.],
       [2., 2., 0.],
       [2., 4., 0.],
       [0., 4., 2.],
       [1., 4., 1.]])

In [6]:
df = pd.read_csv('MrTrumpSpeeches.csv', sep='~', encoding='latin1')
df['sentiment'] = np.where(df['like_count'] > df['dislike_count'], 1, 0)
df.head()

Unnamed: 0,id,playlist,upload_date,title,view_count,average_rating,like_count,dislike_count,subtitles,sentiment
0,-2WTNSujhjk,Donald Trump Speeches & Events,20160220,Live Stream: Donald Trump Victory Rally in Spa...,4057.0,4.259259,44.0,10.0,presidents of the United States mr. go tr...,1
1,-64nfy6i58w,Donald Trump Speeches & Events,20161107,LAST RALLY: Donald Trump FINAL CAMPAIGN Rally ...,47276.0,4.358025,952.0,182.0,it's now officially Tuesday November a di...,1
2,-7Sp31hTxkU,Donald Trump Speeches & Events,20160423,"FULL SPEECH: Donald Trump Rally in Bridgeport,...",19966.0,4.666667,220.0,20.0,you [Music] [Music] [Music] you I...,1
3,-byuyavcNI4,Donald Trump Speeches & Events,20160617,"Full Speech: Donald Trump Rally in Houston, Te...",15138.0,4.582491,266.0,31.0,we welcome stars and president [Music] ...,1
4,09BXh-AA72M,Donald Trump Speeches & Events,20161105,"Full Speech: Donald Trump Rally in Denver, Col...",8720.0,4.924731,365.0,7.0,you thank you [Music] great people Gr...,1


In [7]:
# lets check some characteristics of the datframe
df.shape

(836, 10)

In [8]:
# ok so 836 rows by 10 columns. Lets check the numeric columns for issues
df.describe()

Unnamed: 0,upload_date,view_count,average_rating,like_count,dislike_count,sentiment
count,836.0,836.0,836.0,836.0,836.0,836.0
mean,20165260.0,24782.65,4.345376,288.478469,56.348086,0.912679
std,4783.116,60209.21,0.702294,498.063763,188.722867,0.282473
min,20160200.0,71.0,1.347826,2.0,0.0,0.0
25%,20160810.0,1788.0,4.263935,29.0,4.0,1.0
50%,20161100.0,8775.5,4.589107,122.0,19.0,1.0
75%,20170310.0,22709.0,4.782065,358.25,47.0,1.0
max,20170800.0,1013054.0,5.0,6143.0,3544.0,1.0


In [9]:
# ok so far so good. Lets check the non numeric columns
for col in ['id','playlist','title','subtitles']:
    print(df[col].describe())

count             836
unique            836
top       qi6UCC_L0WU
freq                1
Name: id, dtype: object
count                                                   836
unique                                                    5
top       Donald Trump Rally Speech Events Press Confere...
freq                                                    369
Name: playlist, dtype: object
count                                                   836
unique                                                  834
top       WATCH: President Donald Trump Press Conference...
freq                                                      2
Name: title, dtype: object
count                                                   836
unique                                                  836
top          my fellow Americans it's an exciting   time...
freq                                                      1
Name: subtitles, dtype: object


In [10]:
# so looks like we have all the data. Lets check for missing data and nans
df.isnull().any()

id                False
playlist          False
upload_date       False
title             False
view_count        False
average_rating    False
like_count        False
dislike_count     False
subtitles         False
sentiment         False
dtype: bool

In [31]:
# so there are some genral punctuation marks that should be removed since they dont add anything to the context.
# Some word are enclosed inside [] indicating environment and not really contributing to the sentiment anaylysis
def cleaningFunc(line):
    line = re.sub('[\/\+\-,:.\'\$*%\&]','',line) # remove puctuation marks etc.
    line = line.lower() # convert everthing to lowercase
    words = line.split() # split into words
    newwords = []
    for word in words:
        if "[" in word: # skip words enclosed in brackets
            continue
        word = word.strip(' ') # remove any trailing or leading spaces
        newwords.append(word)
    return " ".join(newwords) # return sentence with word seperated by a single space

In [32]:
df['subtitle_clean'] = df['subtitles'].apply(lambda x : cleaningFunc(x))
cvecs = CountVectorizer(stop_words='english')
Xcv = cvecs.fit_transform(df['subtitle_clean'].values)
print("Count matrix shape :", Xcv.shape)
tfvecs = TfidfVectorizer(max_df=0.8,stop_words='english') # only keep words that are less than 80% repeated in document
Xtf = tfvecs.fit_transform(df['subtitle_clean'].values)
print("Tfidf matrix shape :", Xtf.shape)
df.head()

Count matrix shape : (836, 23623)
Tfidf matrix shape : (836, 23620)


Unnamed: 0,id,playlist,upload_date,title,view_count,average_rating,like_count,dislike_count,subtitles,sentiment,subtitle_clean
0,-2WTNSujhjk,Donald Trump Speeches & Events,20160220,Live Stream: Donald Trump Victory Rally in Spa...,4057.0,4.259259,44.0,10.0,presidents of the United States mr. go tr...,1,presidents of the united states mr go trapp fa...
1,-64nfy6i58w,Donald Trump Speeches & Events,20161107,LAST RALLY: Donald Trump FINAL CAMPAIGN Rally ...,47276.0,4.358025,952.0,182.0,it's now officially Tuesday November a di...,1,its now officially tuesday november a did you ...
2,-7Sp31hTxkU,Donald Trump Speeches & Events,20160423,"FULL SPEECH: Donald Trump Rally in Bridgeport,...",19966.0,4.666667,220.0,20.0,you [Music] [Music] [Music] you I...,1,you you i you the greatness of our country as ...
3,-byuyavcNI4,Donald Trump Speeches & Events,20160617,"Full Speech: Donald Trump Rally in Houston, Te...",15138.0,4.582491,266.0,31.0,we welcome stars and president [Music] ...,1,we welcome stars and president all over the pl...
4,09BXh-AA72M,Donald Trump Speeches & Events,20161105,"Full Speech: Donald Trump Rally in Denver, Col...",8720.0,4.924731,365.0,7.0,you thank you [Music] great people Gr...,1,you thank you great people granna before in th...


In [75]:
X_train,X_test,y_train,y_test = train_test_split(Xcv,df['sentiment'],test_size=0.2,random_state=0)
logistic = LogisticRegression()
logistic.fit(X_train,y_train)
y_pred = logistic.predict(X_test)
print("F1 score is : {}".format(f1_score(y_test,y_pred)))

F1 score is : 0.9426751592356688


In [76]:
X2_train,X2_test,y2_train,y2_test = train_test_split(Xtf,df['sentiment'],test_size=0.2,random_state = 0)
logistic2 = LogisticRegression()
logistic2.fit(X2_train,y2_train)
y2_pred = logistic2.predict(X2_test)
print("F1 score is : {}".format(f1_score(y2_test,y2_pred)))

F1 score is : 0.9597523219814241


In [77]:
tfvecsngram = TfidfVectorizer(max_df=0.8,stop_words='english',ngram_range=(1,3)) # only keep words that are less than 80% repeated in document
Xtfng = tfvecs.fit_transform(df['subtitle_clean'].values)

In [78]:
X3_train,X3_test,y3_train,y3_test = train_test_split(Xtfng,df['sentiment'],test_size=0.2,random_state = 0)
logistic3 = LogisticRegression()
logistic3.fit(X3_train,y3_train)
y3_pred = logistic3.predict(X3_test)
print("F1 score is : {}".format(f1_score(y3_test,y3_pred)))

F1 score is : 0.9597523219814241


In [81]:
supvec = LinearSVC()
supvec.fit(X_train,y_train)
y4_pred = supvec.predict(X_test)
print("F1 score is : {}".format(f1_score(y_test,y4_pred)))

F1 score is : 0.9290322580645162


In [82]:
supvec2 = LinearSVC()
supvec2.fit(X2_train,y2_train)
y5_pred = supvec2.predict(X2_test)
print("F1 score is : {}".format(f1_score(y2_test,y5_pred)))

F1 score is : 0.9597523219814241


In [83]:
supvec3 = LinearSVC()
supvec3.fit(X3_train,y3_train)
y6_pred = supvec3.predict(X3_test)
print("F1 score is : {}".format(f1_score(y3_test,y6_pred)))

F1 score is : 0.9597523219814241
