In [2]:
import pandas as pd
import numpy as np
import random
import csv
raw_data = pd.read_csv('train-v2.tsv', sep="\t", header=None, quoting = csv.QUOTE_NONE)
raw_data.columns = ['label', 'tweet']
raw_data = raw_data[pd.notnull(raw_data['tweet'])]
raw_data.head()

Unnamed: 0,label,tweet
0,0,@USER @USER a sicrhau bod mwy o arian poced 'd...
1,1,Parti Dolig da gyda tim swyddfa canolog @USER ...
2,0,@USER yeaah ma fe yn wir. ( oh well.
3,1,@USER hahaha idk. 3am oedd y bws ti?
4,0,@USER dwim yn gal llun ohoni?


In [3]:
print(raw_data.shape)
raw_data.head(100)

(80000, 2)


Unnamed: 0,label,tweet
0,0,@USER @USER a sicrhau bod mwy o arian poced 'd...
1,1,Parti Dolig da gyda tim swyddfa canolog @USER ...
2,0,@USER yeaah ma fe yn wir. ( oh well.
3,1,@USER hahaha idk. 3am oedd y bws ti?
4,0,@USER dwim yn gal llun ohoni?
...,...,...
95,1,Mae ein ceidwad Jim wedi cymryd lluniau hyfryd...
96,0,Amserlen @USER a'r plant heddiw:@USER (sesiwn ...
97,1,@USER dim problem. Diolch am y gusan Aled
98,1,@USER Ffab. Gweld ti prawn Dydd Sul y 20th de ...


In [4]:
# Appears to be an almost perfectly balanced dataset.  Our "priors" are 50/50
print(raw_data[raw_data['label']==0].shape[0])
print(raw_data[raw_data['label']==1].shape[0])

39981
40019


In [5]:
# Hold back training data
training_data = raw_data.sample(frac=0.8)
testing_data = raw_data.drop(training_data.index)

In [6]:
print(training_data.shape)
print(testing_data.shape)

(64000, 2)
(16000, 2)


In [7]:
# Separate training data into positive and negative datasets for simplicity
training_data_pos = training_data[training_data['label']==1]
training_data_neg = training_data[training_data['label']==0]

In [8]:
print(training_data_neg.shape)
print(training_data_pos.shape)

(32010, 2)
(31990, 2)


In [9]:
training_data_neg.head

<bound method NDFrame.head of        label                                              tweet
3654       0  @USER ffaaaaac, sa waeth sa ti'n Mumbai na Gae...
54772      0          @USER ie Capel Mydroilyn! Hollol gutted!!
78186      0  @USER onin crio borama cys oddon brifo gymaint...
29490      0  @USER weeel dwin gwithio bob nos o nos fercher...
70375      0  Newyddion trist iawn am farwolaeth Rowena Kincaid
...      ...                                                ...
65093      0  @USER O.N. sori am beidio dod neithiwr - gum i...
10562      0  @USER paid a trio gwadu'r peth. "O na fy chino...
77460      0  ffiseg mor anodd pam nesi gymud ooo #bringondaU 👎
71776      0                      Llawn annwyd heddiw . #bleugh
40342      0  @USER na, dwi am ffonior hen leri wan i nol fi ((

[32010 rows x 2 columns]>

In [10]:
training_data_pos.head

<bound method NDFrame.head of        label                                              tweet
38925      1  @USER oooh gobitho bydd e;n gallu trefnu rhywb...
35686      1  @USER ia heddiw , dwi angan gwbod numbers so t...
50069      1  @USER oooh man! I thought odd da fi fe! Falle ...
43215      1  @USER haha snap, excuse ydi mai'n ddolig! doli...
9845       1  Lyfli chat da @USER ar y ffon! Weve doneee it ...
...      ...                                                ...
30309      1      YES!!!!! Cymru di curo 27-13 Proud o nhw xxxx
31448      1  @USER Eitha genfigenus mai rhan o job ti yw gw...
19207      1  Waw! Mae tywydd yn braf heddiw! Diolch am ein ...
49636      1  @USER @USER Waw.... Interflora!!! Da wan @USER...
53718      1  @USER ma "J" yn y wyddor ma nhw'n dysgu yn ysg...

[31990 rows x 2 columns]>

In [11]:
# We need to calculate/generate the following:
# Number of unique words in positive tweets
# Number of unique words in negative tweets

# We could make a dictionary of unique words as keys, values as number of times the word appears

In [12]:
uniquePositiveWords={}
uniqueNegativeWords={}

In [13]:
for tweet in range(len(training_data_pos)):
    currentTweet=training_data_pos.iloc[tweet]['tweet'].split()
    for word in currentTweet:
        if word in uniquePositiveWords:
            uniquePositiveWords[word]+=1
        else:
            uniquePositiveWords[word]=1
for tweet in range(len(training_data_neg)):
    currentTweet=training_data_neg.iloc[tweet]['tweet'].split()
    for word in currentTweet:
        if word in uniqueNegativeWords:
            uniqueNegativeWords[word]+=1
        else:
            uniqueNegativeWords[word]=1

In [14]:
numUniquePositiveWords=len(uniquePositiveWords)
numUniqueNegativeWords=len(uniqueNegativeWords)
print(numUniquePositiveWords)
print(numUniqueNegativeWords)

69400
67033


In [15]:
uniquePositiveWords

{'@USER': 34171,
 'oooh': 26,
 'gobitho': 60,
 'bydd': 491,
 'e;n': 1,
 'gallu': 294,
 'trefnu': 33,
 'rhywbeth': 131,
 'i': 10366,
 'ti': 3045,
 'ia': 403,
 'heddiw': 515,
 ',': 249,
 'dwi': 1551,
 'angan': 118,
 'gwbod': 310,
 'numbers': 6,
 'so': 683,
 'dod': 822,
 'ta': 169,
 'be?': 18,
 'Quick': 2,
 'trip': 47,
 'cardiff': 7,
 'man!': 14,
 'I': 1601,
 'thought': 15,
 'odd': 615,
 'da': 1413,
 'fi': 3125,
 'fe!': 22,
 'Falle': 44,
 'y': 6832,
 'box': 17,
 'bach': 750,
 'yn': 17514,
 'fach': 97,
 'am': 4554,
 '50!': 1,
 'hihi': 35,
 'xx': 1219,
 'haha': 1054,
 'snap,': 1,
 'excuse': 13,
 'ydi': 125,
 "mai'n": 27,
 'ddolig!': 2,
 'dolig': 86,
 'llawen': 56,
 'iawn': 1787,
 'chi': 1085,
 'gyd': 405,
 'joiwch!': 4,
 'welai': 132,
 'di': 3040,
 'nos': 605,
 'iau,': 5,
 'seshwwwn!': 1,
 'XXX': 14,
 'Lyfli': 27,
 'chat': 28,
 'ar': 4949,
 'ffon!': 4,
 'Weve': 1,
 'doneee': 1,
 'it': 248,
 'aaaaa': 3,
 '😚😚': 2,
 '@USER:': 22,
 '#gwenwchmaenddyddgwener': 6,
 'a': 6511,
 'mond': 145,
 'un': 

In [27]:
# Calculate word sentiment scores
for key in uniquePositiveWords:
    uniquePositiveWords[key]=(uniquePositiveWords[key]+1)/(numUniquePositiveWords+numUniqueNegativeWords)
for key in uniqueNegativeWords:
    uniqueNegativeWords[key]=(uniqueNegativeWords[key]+1)/(numUniquePositiveWords+numUniqueNegativeWords)

In [28]:
uniquePositiveWords

{'@USER': 7.329605007586153e-06,
 'oooh': 7.329605007586141e-06,
 'gobitho': 7.329605007586141e-06,
 'bydd': 7.329605007586141e-06,
 'e;n': 7.329605007586141e-06,
 'gallu': 7.329605007586141e-06,
 'trefnu': 7.329605007586141e-06,
 'rhywbeth': 7.329605007586141e-06,
 'i': 7.329605007586144e-06,
 'ti': 7.329605007586143e-06,
 'ia': 7.329605007586141e-06,
 'heddiw': 7.329605007586141e-06,
 ',': 7.329605007586141e-06,
 'dwi': 7.329605007586141e-06,
 'angan': 7.329605007586141e-06,
 'gwbod': 7.329605007586141e-06,
 'numbers': 7.329605007586141e-06,
 'so': 7.329605007586141e-06,
 'dod': 7.329605007586141e-06,
 'ta': 7.329605007586141e-06,
 'be?': 7.329605007586141e-06,
 'Quick': 7.329605007586141e-06,
 'trip': 7.329605007586141e-06,
 'cardiff': 7.329605007586141e-06,
 'man!': 7.329605007586141e-06,
 'I': 7.329605007586141e-06,
 'thought': 7.329605007586141e-06,
 'odd': 7.329605007586141e-06,
 'da': 7.329605007586141e-06,
 'fi': 7.329605007586143e-06,
 'fe!': 7.329605007586141e-06,
 'Falle': 

In [29]:
uniqueNegativeWords

{'@USER': 7.329646836066898e-06,
 'ffaaaaac,': 7.329714350820183e-06,
 'sa': 7.355956726990439e-06,
 'waeth': 7.340648674224457e-06,
 "ti'n": 7.391821307756453e-06,
 'Mumbai': 7.329714350820183e-06,
 'na': 7.533639482309871e-06,
 'Gaerdydd': 7.335618885458491e-06,
 'myn': 7.3500521923521314e-06,
 'uffach': 7.329714350820183e-06,
 'I': 7.521830413033256e-06,
 '>': 7.336931004267004e-06,
 'di': 7.774631970140041e-06,
 'bod': 7.618489831927026e-06,
 'isie': 7.3323385884372095e-06,
 'textio': 7.3344161098840215e-06,
 'ti': 7.505647614394932e-06,
 'holi': 7.330698439926568e-06,
 'sut': 7.3521297137989435e-06,
 'wyt': 7.337805750139345e-06,
 'ti,': 7.332666618139337e-06,
 'a': 8.084182665715001e-06,
 'methu,': 7.329933037288269e-06,
 'grr.': 7.329714350820183e-06,
 'Nol': 7.3344161098840215e-06,
 'nos': 7.398491245033059e-06,
 'Ferch': 7.329714350820183e-06,
 'ie': 7.33594691516062e-06,
 'Capel': 7.330151723756355e-06,
 'Mydroilyn!': 7.329714350820183e-06,
 'Hollol': 7.333103991075509e-06,
 