# Create Doc Term Matrix

#### Define Corpus (text to be converted)

In [1]:
import pandas as pd
dftext = pd.read_csv("MrgdEvts_w_Setmnt.csv")
Corpus = dftext.text #you can also use df['column_name']


#### define list of words to use as features

In [2]:
#TextSet = import words from https://docs.google.com/spreadsheets/d/1RnJwifFR8LVoPzhDvfeKKO2Vl952iMj56FvFAjkIk6s/edit#gid=0

dfVocab = pd.read_csv("PNVocabList.csv") # this needs to be in the same file as the notebook
Features = dfVocab.word 


We’ll import CountVectorizer from sklearn and instantiate it as an object, similar to how you would with a classifier from sklearn. 

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(analyzer = 'word', binary=False, vocabulary=Features)

In [4]:
vect.fit(Corpus)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None,
        vocabulary=0           abrogate
1          accession
2             accord
3      accreditation
4        acknowledge
5              adept
6             adhere
7          adversary
8            adviser
9           advocacy
10            agency
11             agent
12        allegiance
13          alli...  vis-a-vis
246         watchdog
247         watchful
248        withdrawn
Name: word, dtype: object)

In [5]:
vect.get_feature_names()

['abrogate',
 'accession',
 'accord',
 'accreditation',
 'acknowledge',
 'adept',
 'adhere',
 'adversary',
 'adviser',
 'advocacy',
 'agency',
 'agent',
 'allegiance',
 'alliance',
 'ally',
 'ambassador',
 'annex',
 'annexation',
 'announce',
 'appeasement',
 'arbitration',
 'attache',
 'autonomous',
 'bargain',
 'barter',
 'bestow',
 'betray',
 'bilateral',
 'border',
 'boycott',
 'breach',
 'broker',
 'cartography',
 'channel',
 'charisma',
 'claim',
 'client',
 'collaboration',
 'collective',
 'commerce',
 'compliance',
 'conciliatory',
 'condemnation',
 'conduct',
 'conflict',
 'confront',
 'conquer',
 'conspiracy',
 'consular',
 'consulate',
 'convention',
 'cooperation',
 'counterpart',
 'courtesy',
 'covert',
 'credential',
 'crisis',
 'cunning',
 'customary',
 'de facto',
 'declaration',
 'declare',
 'defiance',
 'deflect',
 'delegation',
 'denounce',
 'deportment',
 'dialogue',
 'diplomacy',
 'diplomat',
 'diplomatic',
 'diplomatic immunity',
 'displaced person',
 'dispute',
 

In [6]:
dtm = vect.transform(Corpus)
repr(dtm)

"<14841x249 sparse matrix of type '<class 'numpy.int64'>'\n\twith 116618 stored elements in Compressed Sparse Row format>"

In [7]:
print(dtm)

  (0, 14)	1
  (0, 75)	1
  (0, 108)	1
  (0, 186)	5
  (0, 193)	1
  (0, 215)	1
  (0, 217)	1
  (1, 7)	1
  (1, 13)	1
  (1, 14)	1
  (1, 35)	2
  (1, 112)	1
  (1, 114)	3
  (1, 135)	3
  (1, 136)	1
  (1, 186)	1
  (1, 198)	1
  (1, 230)	1
  (2, 10)	3
  (2, 67)	1
  (2, 85)	2
  (2, 192)	1
  (3, 67)	1
  (3, 76)	1
  (3, 122)	4
  :	:
  (14839, 15)	1
  (14839, 17)	1
  (14839, 28)	1
  (14839, 35)	1
  (14839, 42)	1
  (14839, 44)	1
  (14839, 77)	5
  (14839, 113)	1
  (14839, 114)	1
  (14839, 136)	1
  (14839, 170)	1
  (14839, 185)	1
  (14839, 203)	1
  (14839, 209)	2
  (14839, 210)	1
  (14839, 217)	1
  (14840, 16)	1
  (14840, 45)	1
  (14840, 75)	1
  (14840, 114)	1
  (14840, 126)	1
  (14840, 127)	1
  (14840, 130)	1
  (14840, 145)	2
  (14840, 223)	2


In [8]:
pd.DataFrame(dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,abrogate,accession,accord,accreditation,acknowledge,adept,adhere,adversary,adviser,advocacy,...,unilateral,upheaval,urge,urgency,verity,violate,vis-a-vis,watchdog,watchful,withdrawn
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
dtm.shape

(14841, 249)

In [47]:
dtm = vect.fit_transform(Corpus)
pd.DataFrame(dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,abrogate,accession,accord,accreditation,acknowledge,adept,adhere,adversary,adviser,advocacy,...,unilateral,upheaval,urge,urgency,verity,violate,vis-a-vis,watchdog,watchful,withdrawn
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


####  one hot vectorize

In [19]:
def one_hot_vectorize(corpus):
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.preprocessing import Binarizer
    freq = CountVectorizer(analyzer = 'word', binary=False, vocabulary=Features)
    freq_vectors = freq.fit_transform(corpus)
    onehot = Binarizer()
    one_hot_vectors = onehot.fit_transform(freq_vectors.toarray())
    return one_hot_vectors



In [20]:
dtm2 = one_hot_vectorize(Corpus)

In [21]:
dtm2.shape

(14841, 249)

In [34]:
OneHotV = vect.fit_transform(Corpus)
pd.DataFrame(dtm2, columns=vect.get_feature_names())

Unnamed: 0,abrogate,accession,accord,accreditation,acknowledge,adept,adhere,adversary,adviser,advocacy,...,unilateral,upheaval,urge,urgency,verity,violate,vis-a-vis,watchdog,watchful,withdrawn
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(dtm2)
print(dist)

[[  3.33066907e-16   7.72078847e-01   1.00000000e+00 ...,   6.91393300e-01
    9.08330150e-01   8.74011842e-01]
 [  7.72078847e-01  -2.22044605e-16   1.00000000e+00 ...,   1.00000000e+00
    7.80618273e-01   8.99496218e-01]
 [  1.00000000e+00   1.00000000e+00   0.00000000e+00 ...,   1.00000000e+00
    1.00000000e+00   1.00000000e+00]
 ..., 
 [  6.91393300e-01   1.00000000e+00   1.00000000e+00 ...,  -2.22044605e-16
    8.01970491e-01   1.00000000e+00]
 [  9.08330150e-01   7.80618273e-01   1.00000000e+00 ...,   8.01970491e-01
    0.00000000e+00   9.19154792e-01]
 [  8.74011842e-01   8.99496218e-01   1.00000000e+00 ...,   1.00000000e+00
    9.19154792e-01   1.11022302e-16]]


In [22]:
repr(dtm2)

'array([[0, 0, 0, ..., 0, 0, 0],\n       [0, 0, 0, ..., 0, 0, 0],\n       [0, 0, 0, ..., 0, 0, 0],\n       ..., \n       [0, 0, 0, ..., 0, 0, 0],\n       [0, 0, 0, ..., 0, 0, 0],\n       [0, 0, 0, ..., 0, 0, 0]])'

#### TF-IDF term frequency-inverse document frequency 

In [29]:
def sklearn_tfidf_vectorize(corpus):
    from sklearn.feature_extraction.text import TfidfVectorizer
    tfidf = TfidfVectorizer(analyzer = 'word', binary=False, vocabulary=Features)
    tfidf_vectors = tfidf.fit_transform(corpus)
    return tfidf_vectors

In [30]:
dtm3 = sklearn_tfidf_vectorize(Corpus)

In [31]:
dtm3.shape

(14841, 249)

In [38]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(dtm3)
print(dist)

[[ -2.22044605e-16   8.65241817e-01   1.00000000e+00 ...,   8.16256975e-01
    9.77990224e-01   9.08169840e-01]
 [  8.65241817e-01   0.00000000e+00   1.00000000e+00 ...,   1.00000000e+00
    8.58472874e-01   9.12571604e-01]
 [  1.00000000e+00   1.00000000e+00  -2.22044605e-16 ...,   1.00000000e+00
    1.00000000e+00   1.00000000e+00]
 ..., 
 [  8.16256975e-01   1.00000000e+00   1.00000000e+00 ...,  -2.22044605e-16
    8.84856289e-01   1.00000000e+00]
 [  9.77990224e-01   8.58472874e-01   1.00000000e+00 ...,   8.84856289e-01
    0.00000000e+00   9.79658571e-01]
 [  9.08169840e-01   9.12571604e-01   1.00000000e+00 ...,   1.00000000e+00
    9.79658571e-01  -2.22044605e-16]]


In [50]:
dtm4 = vect.fit_transform(Corpus)
pd.DataFrame(dtm4.toarray(), columns=vect.get_feature_names())

Unnamed: 0,abrogate,accession,accord,accreditation,acknowledge,adept,adhere,adversary,adviser,advocacy,...,unilateral,upheaval,urge,urgency,verity,violate,vis-a-vis,watchdog,watchful,withdrawn
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(dtm4)
print(dist)

[[ -2.22044605e-16   8.03252249e-01   1.00000000e+00 ...,   8.44457246e-01
    9.72923482e-01   9.53626110e-01]
 [  8.03252249e-01   2.22044605e-16   1.00000000e+00 ...,   1.00000000e+00
    8.34855435e-01   8.58578644e-01]
 [  1.00000000e+00   1.00000000e+00   0.00000000e+00 ...,   1.00000000e+00
    1.00000000e+00   1.00000000e+00]
 ..., 
 [  8.44457246e-01   1.00000000e+00   1.00000000e+00 ...,  -2.22044605e-16
    8.25922344e-01   1.00000000e+00]
 [  9.72923482e-01   8.34855435e-01   1.00000000e+00 ...,   8.25922344e-01
    1.11022302e-16   9.61075053e-01]
 [  9.53626110e-01   8.58578644e-01   1.00000000e+00 ...,   1.00000000e+00
    9.61075053e-01   2.22044605e-16]]


#### plot and compare data matrixes 

In [None]:
#:( insert code here to plot charts and some code to compare and select the best.

####  Merge dftext (MrgdEvts_w_Setmnt.csv) and selected matrix - I liked the way dtm4 looked so, im merging it.

In [61]:
#df5 = pd.concat([dftext,dtm], axis=1) - returns error -TypeError: cannot concatenate a non-NDFrame object

#### Merge Crude Oil feature

In [64]:
CrudeOil = pd.read_csv("Crude_Oil.csv")

In [65]:
pd.DataFrame(CrudeOil)

Unnamed: 0,Date,Value
0,1/4/2000,23.95
1,1/5/2000,23.72
2,1/6/2000,23.55
3,1/7/2000,23.35
4,1/10/2000,22.77
5,1/11/2000,23.93
6,1/12/2000,24.62
7,1/13/2000,24.9
8,1/14/2000,25.5
9,1/17/2000,25.99


In [73]:
addedCOil = pd.merge(dftext, CrudeOil, on='Date', how='right', indicator=False)
addedCOil.sort_values(by='Date')


Unnamed: 0.2,Unnamed: 0,EventID,Date,EventText,FilterWords,FinalRating,Unnamed: 0.1,publication,title,length,publicationtype,text,year,month,day,PositiveCount,NegativeCount,tone,_merge,Value
259,,,1/1/2001,,,,,,,,,,,,,,,,,.
520,,,1/1/2002,,,,,,,,,,,,,,,,,.
781,,,1/1/2003,,,,,,,,,,,,,,,,,.
1042,,,1/1/2004,,,,,,,,,,,,,,,,,.
1824,,,1/1/2007,,,,,,,,,,,,,,,,,.
2085,,,1/1/2008,,,,,,,,,,,,,,,,,.
2347,,,1/1/2009,,,,,,,,,,,,,,,,,.
2608,,,1/1/2010,,,,,,,,,,,,,,,,,.
3390,,,1/1/2013,,,,,,,,,,,,,,,,,.
3651,,,1/1/2014,,,,,,,,,,,,,,,,,.


In [81]:
left = pd.DataFrame(CrudeOil)

right = pd.DataFrame(dftext)

result = left.join(right,on='Date', how='outer',lsuffix='_left', rsuffix='_right')

In [82]:
result

Unnamed: 0.2,Date,Date_left,Value,Unnamed: 0,EventID,Date_right,EventText,FilterWords,FinalRating,Unnamed: 0.1,...,length,publicationtype,text,year,month,day,PositiveCount,NegativeCount,tone,_merge
0,1/4/2000,1/4/2000,23.95,,,,,,,,...,,,,,,,,,,
1,1/5/2000,1/5/2000,23.72,,,,,,,,...,,,,,,,,,,
2,1/6/2000,1/6/2000,23.55,,,,,,,,...,,,,,,,,,,
3,1/7/2000,1/7/2000,23.35,,,,,,,,...,,,,,,,,,,
4,1/10/2000,1/10/2000,22.77,,,,,,,,...,,,,,,,,,,
5,1/11/2000,1/11/2000,23.93,,,,,,,,...,,,,,,,,,,
6,1/12/2000,1/12/2000,24.62,,,,,,,,...,,,,,,,,,,
7,1/13/2000,1/13/2000,24.9,,,,,,,,...,,,,,,,,,,
8,1/14/2000,1/14/2000,25.5,,,,,,,,...,,,,,,,,,,
9,1/17/2000,1/17/2000,25.99,,,,,,,,...,,,,,,,,,,
