In [45]:
import numpy as np
import pandas as pd
from collections import OrderedDict
import re
from num2words import num2words
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [4]:
# Toy corpus: five short sentences, one list entry per sentence.
corpus = [
    'Apple Orange Orange Apple',
    'Apple Banana Apple Banana',
    'Banana Apple Banana Banana Banana Apple',
    'Banana Orange Banana Banana Orange Banana',
    'Banana Apple Banana Banana Orange Banana',
]

In [5]:
def myCountVec(corpus):
    """Vectorize the word counts of a list of sentences. Does not ignore case.

    Each sentence is tokenised with ``str.split()`` (whitespace only) and
    tokens are compared case-sensitively, so 'Apple' and 'apple' end up in
    different columns.

    Parameters
    ----------
    corpus : sequence of str
        The sentences to vectorize. It is iterated twice and passed to
        ``len()``, so it must be a re-iterable sized container (e.g. a list).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(corpus), n_unique_words)``. Entry
        ``[i, j]`` is the number of times the j-th word of the sorted
        vocabulary occurs in sentence ``i``.
    """
    # A set comprehension de-duplicates the tokens; sorting fixes the column order.
    vocabulary = sorted({word for line in corpus for word in line.split()})
    # Map every word to the column that holds its counts.
    word_index = {word: col for col, word in enumerate(vocabulary)}
    # Start from an all-zero matrix (np.zeros defaults to a float dtype).
    vec_of_counts = np.zeros((len(corpus), len(word_index)))
    for row, line in enumerate(corpus):
        for word in line.split():
            # Bump the count every time the word occurs in this sentence.
            vec_of_counts[row, word_index[word]] += 1
    return vec_of_counts

In [6]:
# Sanity check on the toy corpus: one row per sentence, one column per word of the sorted vocabulary.
myCountVec(corpus)

array([[2., 0., 2.],
       [2., 2., 0.],
       [2., 4., 0.],
       [0., 4., 2.],
       [1., 4., 1.]])

In [7]:
# Read the '~'-separated, latin1-encoded CSV into a DataFrame.
df = pd.read_csv('MrTrumpSpeeches.csv', encoding='latin1', sep='~')
# Binary label: 1 when a row has strictly more likes than dislikes, otherwise 0.
has_more_likes = df['like_count'] > df['dislike_count']
df['sentiment'] = np.where(has_more_likes, 1, 0)
df.head()

Unnamed: 0,id,playlist,upload_date,title,view_count,average_rating,like_count,dislike_count,subtitles,sentiment
0,-2WTNSujhjk,Donald Trump Speeches & Events,20160220,Live Stream: Donald Trump Victory Rally in Spa...,4057.0,4.259259,44.0,10.0,presidents of the United States mr. go tr...,1
1,-64nfy6i58w,Donald Trump Speeches & Events,20161107,LAST RALLY: Donald Trump FINAL CAMPAIGN Rally ...,47276.0,4.358025,952.0,182.0,it's now officially Tuesday November a di...,1
2,-7Sp31hTxkU,Donald Trump Speeches & Events,20160423,"FULL SPEECH: Donald Trump Rally in Bridgeport,...",19966.0,4.666667,220.0,20.0,you [Music] [Music] [Music] you I...,1
3,-byuyavcNI4,Donald Trump Speeches & Events,20160617,"Full Speech: Donald Trump Rally in Houston, Te...",15138.0,4.582491,266.0,31.0,we welcome stars and president [Music] ...,1
4,09BXh-AA72M,Donald Trump Speeches & Events,20161105,"Full Speech: Donald Trump Rally in Denver, Col...",8720.0,4.924731,365.0,7.0,you thank you [Music] great people Gr...,1


In [9]:
# Before cleaning, tally every lower-cased token of the raw subtitles so that
# odd entries are easy to spot.
D = {}
for line in df['subtitles']:
    for word in line.split():
        key = word.lower()
        D[key] = D.get(key, 0) + 1
# Order the tally alphabetically by token to make it easier to scan.
D1 = OrderedDict(sorted(D.items()))
D1.items()



In [43]:
# So there are some general punctuation marks and brackets that should be removed, since they don't add anything to the context.
# There are also quite a few words that are numbers. We will try to change these to text using num2words.
def cleaningFunc(line):
    """Strip punctuation from a line and spell out whole-number tokens.

    The characters ``/ + - , : . ' $ * % [ ] &`` are deleted first. The line
    is then split on whitespace, and every token made up only of decimal
    digits is replaced by ``num2words(int(token))`` with spaces, commas and
    hyphens removed, so each number stays a single token.

    NOTE: because '.' and ',' are deleted before the split, tokens such as
    '3.5' or '1,000' are converted as the integers 35 and 1000.

    Parameters
    ----------
    line : str
        The text to clean.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    # Raw string: the previous non-raw pattern relied on invalid escape
    # sequences (e.g. '\/', '\$'), which newer Python versions warn about.
    line = re.sub(r"[/+\-,:.'$*%\[\]&]", '', line)
    newwords = []
    for word in line.split():
        # isdecimal() rather than isdigit(): isdigit() also accepts characters
        # such as '²' that int() cannot parse, which raised ValueError.
        if word.isdecimal():
            newwords.append(re.sub('[ ,-]', '', num2words(int(word))))
        else:
            newwords.append(word)
    return " ".join(newwords)

In [44]:
# Store a cleaned copy of every subtitle, then rebuild the lower-cased token
# tally from the cleaned text.
df['subtitle_clean'] = df['subtitles'].apply(cleaningFunc)
D = {}
for line in df['subtitle_clean']:
    for word in line.split():
        key = word.lower()
        D[key] = D.get(key, 0) + 1
# Order the tally alphabetically by token to make it easier to scan.
D1 = OrderedDict(sorted(D.items()))
D1.items()



In [38]:
# OK, so that's our dictionary of words. Let's pass it into CountVectorizer and TfidfVectorizer.


1