In [22]:
import pickle
import sqlite3

import numpy as np
import pandas as pd
import nltk, re, pprint
from nltk import word_tokenize

# Open the world-news database and read the whole worldnews1 table into df.
con = sqlite3.connect('worldnews.sqlite')
c = con.cursor()

df = pd.read_sql("SELECT * FROM worldnews1", con)

# Optional ordering by post performance (left disabled).
#df.sort('score',ascending=False,inplace=True)
#df.reset_index(drop=True,inplace=True)

# Module-level handle on the post text column.
data = df['body']

def getSubFrameByKeyword(dataframe,keyword):
    """Return the rows of ``dataframe`` whose ``body`` text contains ``keyword``.

    The match is a plain, case-sensitive substring test (``keyword in body``).
    The input frame is not modified; the result is a new frame whose index is
    renumbered 0..n-1.

    dataframe -- DataFrame with a ``body`` column of strings
    keyword   -- substring to look for
    """
    # Test the frame that was passed in (not the module-level `data`), row by
    # row, so the function works for any frame regardless of its index labels.
    keep = np.array([keyword in body for body in dataframe.body], dtype=bool)
    return dataframe.loc[keep].reset_index(drop=True)

#breaking down dataset
# Keep only the posts that mention 'China', report how many there are and
# preview the first rows (the preview was previously requested twice).
df_China = getSubFrameByKeyword(df, 'China')
print(len(df_China))
df_China.head()

7605


Unnamed: 0,body,score
0,Why don't the southern nations claim it? Cmon...,-1
1,&gt;Conservation groups and scientists worry t...,-8
2,"&gt; and a strong, nationalistic anti-Japan tr...",1
3,No report on how much Antarctic krill China cu...,54
4,Thanks China!! I guess it wasnt enough to kill...,-10


# Defining attribute generation methods

In [5]:
#defining methods for attribute generation
from textblob import TextBlob
import textblob

'''
    In generating attributes, we assume that performance (the post's score on
    Reddit) will depend on what is said AND how the author of the post says it.

    We have developed attributes which characterize how an argument is made:
     - What is the general tone of the post?
     - How subjective is it overall?
     - Does the author consider alternative points or arguments (which might register
       as a change in polarity of sentiment for a sentence within the post)?
     - Does the author provide both fact and opinion (high range in subjectivity)?
     - How terse is the prose?
     - How large are the non-trivial (not the, in, etc.) words in the post?

    However, we also want to characterize sentiment with respect to the subject
    of the post. This requires us to identify whether the targetString is a subject
    or simply appears within the post. We identify the sentences in which the
    targetString is the subject and conduct further analysis on these individual
    sentences. In instances where the targetString is the subject of several sentences,
    we calculate aggregate metrics, similar to the analysis used for overall tone.
    We will refer to this analysis as "target-specific":
     - What is the polarity and subjectivity of the sentences in which the targetString
       is referenced? (If the targetString is the subject of multiple sentences, an
       average value is reported)
    
    *It is worth noting that this analysis cannot detect sarcasm or humor. This is a
    significant flaw in this analysis, but it would require cutting edge NLP--beyond
    the scope of this project.
    
    This script also generates a list of adjectives used to directly describe the
    targetString. This list will be used to generate a word-cloud during post-analysis.
'''

### Defining how-it-is-said analysis methods ###

#calculates the polarity and subjectivity of text
def getSentiment(text):
    """Return ``[polarity, subjectivity]`` for ``text`` as scored by TextBlob."""
    # Build the blob and read its sentiment once; both scores come from the
    # same sentiment result, so there is no need to analyse the text twice.
    sentiment = TextBlob(text).sentiment
    return [sentiment.polarity, sentiment.subjectivity]

#splits text into its sentences
def getSentences(text):
    """Return the ``sentences`` attribute of a TextBlob built from ``text``."""
    blob = TextBlob(text)
    return blob.sentences

#mean sentence length, measured in words
def getTerseness(text):
    """Return the average number of words per sentence in ``text``."""
    wordsPerSentence = np.array(
        [len(sentence.words) for sentence in getSentences(text)],
        dtype=float)
    return wordsPerSentence.mean()

#number of words in text
def getWordCount(text):
    """Return how many words TextBlob finds in ``text``."""
    words = TextBlob(text).words
    return len(words)

#count number of big words (> lenLim characters)
#and normalize to the number of "real" words (> 3 characters)
def howPretentious(text,lenLim):
    """Return the share of big words among the non-trivial words of ``text``.

    text   -- the post text
    lenLim -- a word counts as "big" when it has more than lenLim characters

    The denominator is the number of words longer than 3 characters, which
    excludes most stop words without needing a stop-word list. When the text
    has no such word the ratio is undefined, so 0.0 is returned instead of
    dividing by zero.
    """
    words = TextBlob(text).words
    bigWords = sum(1 for word in words if len(word) > lenLim)
    realWords = sum(1 for word in words if len(word) > 3)# excluding most stop-words
    if realWords == 0:
        return 0.0
    return float(bigWords)/realWords

#builds a 2 x N array from N sentences:
#  row 0 holds each sentence's polarity
#  row 1 holds each sentence's subjectivity
def sentenceToSentArray(sentences):
    """Return per-sentence sentiment as ``[[polarity...], [subjectivity...]]``."""
    scores = np.zeros((2, len(sentences)))#pol,sub
    for column, sentence in enumerate(sentences):
        mood = sentence.sentiment
        scores[0, column] = mood.polarity
        scores[1, column] = mood.subjectivity
    return scores

#shared by sentStd and sentRange: sentiment of each sentence, or of each
#comma-separated clause when the text is a single sentence
def _splitSentiments(text):
    """Return the 2 x N sentiment array for the parts of ``text``.

    Returns None when there is nothing to compare: the text has no sentences,
    or it is a single sentence that cannot be split on commas.
    """
    sentences = getSentences(text)
    if len(sentences) == 0:#empty text: nothing to index into
        return None
    if len(sentences) == 1:
        clauses = str(sentences[0]).split(',')#split by commas
        if len(clauses) <= 1:#if we cannot consider separate clauses, no spread
            return None
        sentences = [TextBlob(clause) for clause in clauses]
    return sentenceToSentArray(sentences)

#returns the sample stdev in sentence sentiment
def sentStd(text):
    """Return ``[std of polarity, std of subjectivity]`` across sentences.

    Uses the sample standard deviation (ddof=1). Returns ``[0, 0]`` when the
    text has fewer than two sentences/clauses to compare.
    """
    sent = _splitSentiments(text)
    if sent is None:
        return [0, 0]
    return [np.std(sent[0],ddof=1),np.std(sent[1],ddof=1)]

#returns the difference between max and min polarity and subjectivity
def sentRange(text):
    """Return ``[polarity range, subjectivity range]`` across sentences.

    Returns ``[0, 0]`` when the text has fewer than two sentences/clauses to
    compare.
    """
    sent = _splitSentiments(text)
    if sent is None:
        return [0, 0]
    return [np.ptp(sent[0]),np.ptp(sent[1])]


### Defining what-is-said analysis methods ###

#flags the sentences in which targetString appears as a subject and collects
#the adjectives found in the matching phrases
def getSubjects(text,targetString):
    """Find the sentences of ``text`` in which ``targetString`` is a subject.

    Returns ``[isSub, adjList]``:
      isSub   -- one boolean per sentence; True when targetString appears in a
                 chunk matched by the grammar below
      adjList -- adjectives (longer than 2 characters) taken from the matching
                 chunks, kept for the word cloud
    """
    #array determines if the targetString is the subject of one of the
    #sentences in the post.
    sentences = getSentences(text)
    isSub = [False] * len(sentences)
    adjList=[]

    #looking for phrases with adjective or verb directly attached to the noun
    #if both occur, both are captured within the phrase.
    #This indicates that the noun of interest is either being described
    #or an action of the noun is being discussed.
    #We also remove nouns which are preceded by prepositions, since this reduces
    #the likelihood that the noun is the subject of a post.
    grammar = r"""NP: {<J.*><IN>?<N.*><V.*>?<J.*>?}
                    {<J.*>?<IN>?<N.*><V.*><J.*>?}
                    }<J.*>?<IN><N.*><V.*>?<J.*>?{"""
    #the chunker does not depend on the sentence, so it is built once here
    #rather than once per sentence
    cp = nltk.RegexpParser(grammar)

    tokenized = [nltk.word_tokenize(str(sent)) for sent in sentences]
    tagged = [nltk.pos_tag(tokens) for tokens in tokenized]#using a part-of-speech tagger
    for i, taggedSentence in enumerate(tagged):
        #using noun phrase chunking
        result = cp.parse(taggedSentence)
        #processing Noun Phrase subtrees
        for subtree in result:
            if not isinstance(subtree, nltk.tree.Tree):
                continue
            for phrase in subtree.subtrees():
                for leaf in phrase:
                    #leaves are expected to be (word, tag) pairs from pos_tag,
                    #so this matches a whole token rather than a substring
                    if targetString not in leaf:
                        continue
                    isSub[i] = True

                    #store adjectives for word cloud: the printed phrase
                    #lists its tokens as word/TAG, adjectives carry a JJ* tag
                    for token in str(phrase).split():
                        if '/JJ' in token:
                            adj = token.split('/JJ')[0]
                            if len(adj) > 2:
                                adjList.append(adj)
    return [isSub, adjList]

def getTargetSentiment(text,isSub):
    """Return ``[mean polarity, mean subjectivity]`` of the flagged sentences.

    text  -- the post text
    isSub -- one boolean per sentence of ``text``; True marks the sentences
             whose sentiment is averaged
    """
    sentences = getSentences(text)
    flaggedTotal = sum(1 for flag in isSub if flag)
    targetSent = np.zeros((2, flaggedTotal))
    column = 0
    for position, sentence in enumerate(sentences):
        if not isSub[position]:
            continue
        polarity, subjectivity = getSentiment(str(sentence))
        targetSent[0, column] = polarity
        targetSent[1, column] = subjectivity
        column += 1
    return [targetSent[0].mean(), targetSent[1].mean()]

def generateSeries(df,index,isSub,colNames):
    """Compute the attribute row for the post at ``df`` label ``index``.

    df       -- DataFrame with ``body`` and ``score`` columns
    index    -- label of the post to analyse
    isSub    -- per-sentence flags passed on to getTargetSentiment
    colNames -- accepted but not used here

    Returns a list: body, score, overall polarity, polarity stdev, overall
    subjectivity, subjectivity stdev, polarity range, subjectivity range,
    word count, big-word ratio, mean sentence length, target polarity,
    target subjectivity.
    """
    body = df.body[index].encode('ascii','ignore')
    score = df.score[index]

    overallPol, overallSub = getSentiment(body)
    stdPol, stdSub = sentStd(body)
    polRange, subRange = sentRange(body)
    wordCount = getWordCount(body)
    bigWords = howPretentious(body, 10)#flagging words > 10 (relative to > 3 chars)
    sentLen = getTerseness(body)
    targetPol, targetSub = getTargetSentiment(body, isSub)

    return [
        body, score,
        overallPol, stdPol,
        overallSub, stdSub,
        polRange, subRange,
        wordCount, bigWords, sentLen,
        targetPol, targetSub,
    ]


#column layout shared by every attribute table; the order matches the list
#returned by generateSeries
columnNames = [
    'body', 'score',
    'overallPol', 'stdPol',
    'overallSub', 'stdSub',
    'polRange', 'subRange',
    'wordCount', 'bigWords', 'sentLen',
    'targetPol', 'targetSub',
]


#given a df (example: df_China) and the keyword, returns attributes DataFrame and list of
#adjectives appearing in noun-phrase containing the keyword
def generateData(df,keyword):
    """Build the attribute table for the posts in which ``keyword`` is a subject.

    df      -- DataFrame with ``body`` and ``score`` columns
    keyword -- target string handed to getSubjects

    Returns ``[about_df, adj_List]``: about_df has one row per post in which
    getSubjects flagged at least one sentence (columns ``columnNames``), and
    adj_List is the concatenation of the adjective lists from every post.
    The number of selected posts is printed.
    """
    data = df.body
    rows = []
    adj_List = []

    #single pass: getSubjects is called once per post and the rows are
    #collected in a list, so no counting pass is needed to preallocate
    for z in data.index:
        hasSub, this_adj = getSubjects(data[z].encode('ascii','ignore'),keyword)
        adj_List += this_adj
        if True in hasSub:
            #run analysis
            rows.append(generateSeries(df,z,hasSub,columnNames))

    print(len(rows))

    #dtype=object keeps the columns untyped, as in a frame filled row by row
    about_df = pd.DataFrame(rows,columns=columnNames,dtype=object)
    return [about_df, adj_List]

# Generating World News Attributes Data

In [6]:
# Attribute table and adjective list for the posts in which China is a subject
(aboutChina_df, adj_China) = generateData(df_China, 'China')
aboutChina_df

2452
2452


Unnamed: 0,body,score,overallPol,stdPol,overallSub,stdSub,polRange,subRange,wordCount,bigWords,sentLen,targetPol,targetSub
0,Why don't the southern nations claim it? Cmon...,-1,-0.0625,0.08838835,0.1875,0.265165,0.125,0.375,15,0,7.5,-0.125,0.375
1,"China invaded them back in '79, pulled out aft...",5,0.05,0.0505525,0.3571429,0.2302173,0.1,0.5,74,0,14.8,0,0
2,The dominant religion in Russia is Christianit...,0,0.3666667,0.244949,0.5888889,0.4040306,0.5,0.8333333,53,0.2121212,13.25,0,0
3,"Come on now, you're not seriously implying tha...",1,0.1809524,0.1304753,0.497619,0.3665359,0.3,1,82,0.06521739,11.71429,0,0
4,"""Amouny currently harvested"". How do we know h...",7,0.08888889,0.1807295,0.4044444,0.2416507,0.5555556,0.6333333,51,0,7.285714,0.3,0.4166667
5,Isn't this L'il Lisa's Ocean slurry? If so...s...,2,0,0,0,0,0,0,16,0,8,0,0
6,"Don't forget to ""what the fuck"" the other coun...",9,-0.1875,0.1087332,0.4958333,0.3184435,0.2,0.6666667,43,0,43,-0.1875,0.4958333
7,China drives the demand for rhino/elephant kil...,-3,0.2857143,0,0.5357143,0,0,0,13,0.2222222,13,0.2857143,0.5357143
8,"China isn't the worst, its just so big that un...",16,-0.1357143,0.3818377,0.5738095,0.0730677,0.54,0.1033333,39,0.08695652,19.5,-0.29,0.6033333
9,"I don't know where they're coming from, though...",4,0.08285714,0.1161309,0.4857143,0.2813959,0.2214286,0.5428571,72,0.02439024,24,-0.05,0.4


In [7]:
#store China data
# Pickles are byte streams: write them through binary-mode handles and close
# each file as soon as the dump is complete.
with open('processedData/chinaData.pickle', 'wb') as dataFile:
    pickle.dump(aboutChina_df, dataFile)
with open('processedData/chinaAdj.pickle', 'wb') as adjFile:
    pickle.dump(adj_China, adjFile)
aboutChina_df.targetPol.mean()

0.057329545006661306

In [8]:
# Posts that mention 'America': build the subset, report its size, preview it
df_America = getSubFrameByKeyword(df, 'America')
print(len(df_America))
df_America.head()

16912


Unnamed: 0,body,score
0,&gt; You'll also have to tell me how recently ...,0
1,"&gt; and a strong, nationalistic anti-Japan tr...",1
2,This isn't just Thailand it happens a lot in S...,1
3,You can't say American society rejected it whe...,2
4,I remember American rock and roll being export...,1


In [9]:
# Attribute table and adjective list for the posts in which America is a subject
(aboutAmerica_df, adj_America) = generateData(df_America, 'America')
aboutAmerica_df

1704
1704


Unnamed: 0,body,score,overallPol,stdPol,overallSub,stdSub,polRange,subRange,wordCount,bigWords,sentLen,targetPol,targetSub
0,Still better than the european powers. Esp if ...,1,-0.06973684,0.3682781,0.4625,0.3239268,1.55,1,300,0.05172414,18.75,-0.2083333,0.4777778
1,&gt; Germany's quite a 'cranky' society really...,1,0.09966128,0.2184391,0.4089774,0.2082433,1.1,0.75,750,0.04888889,26.78571,0.2125,0.6875
2,"Yeah, because they have no way of fending off ...",1,0.05833333,0.0412479,0.4583333,0.3240906,0.05833333,0.4583333,31,0.04545455,15.5,0.05833333,0.4583333
3,The difference being America doesn't seize Chi...,3,0,0,0,0,0,0,15,0,15,0,0
4,America is always at fault. Always. No matter ...,-1,0,0,0.5,0.25,0,0.5,18,0,4.5,0,0
5,&gt; However misguided America has been in its...,0,0.1400595,0.3586426,0.5387132,0.3214782,1.8,1,534,0.04320988,17.22581,0.1333333,0.3555556
6,Israel has 400 people/km^2. Kuwait has 200/km^...,0,-0.02800325,0.3188251,0.4818182,0.3023985,1.214286,1,281,0.04166667,18.73333,0.015,0.3885714
7,Everyone saying these dams are the best soluti...,3,0.1688194,0.2455363,0.5938591,0.3102194,0.8388889,1,334,0.04591837,22.26667,0.5,0.5
8,&gt; America shooting down an Iranian civilian...,25,0.01349206,0.1768591,0.4984127,0.3791686,0.5833333,1,142,0.05063291,15.77778,-0.07777778,0.1444444
9,Then America destroys your minelayers and mine...,2,-0.125,0.1154701,0.5125,0.316557,0.2,0.625,22,0.08333333,7.333333,0,0.625


In [10]:
#store America data
# Pickles are byte streams: write them through binary-mode handles and close
# each file as soon as the dump is complete.
with open('processedData/americaData.pickle', 'wb') as dataFile:
    pickle.dump(aboutAmerica_df, dataFile)
with open('processedData/americaAdj.pickle', 'wb') as adjFile:
    pickle.dump(adj_America, adjFile)
aboutAmerica_df.targetPol.mean()

0.039699941901693021

In [11]:
# Posts that mention 'ISIS': build the subset, report its size, preview it
df_ISIS = getSubFrameByKeyword(df, 'ISIS')
print(len(df_ISIS))
df_ISIS.head()

6992


Unnamed: 0,body,score
0,I'd agree with you if the area was stable and ...,3
1,I agree with u/DecisiveMind. If the past decad...,1
2,&gt; The U.S. countered a Soviet Proposal...\n...,0
3,"10 years ago 19 Hijackers were Saudis, today S...",26
4,i think that Hezbollah's position in Lebanon i...,1


In [12]:
# Attribute table and adjective list for the posts in which ISIS is a subject
(aboutISIS_df, adj_ISIS) = generateData(df_ISIS, 'ISIS')
aboutISIS_df

2254
2254


Unnamed: 0,body,score,overallPol,stdPol,overallSub,stdSub,polRange,subRange,wordCount,bigWords,sentLen,targetPol,targetSub
0,"10 years ago 19 Hijackers were Saudis, today S...",26,0.03333333,0.1133303,0.3486111,0.1924914,0.2625,0.44375,68,0.02325581,13.6,-0.0125,0.44375
1,I did say they are a lot \n\n&gt;are taught th...,-2,0.02307692,0.2216094,0.4288462,0.2425729,0.7833333,0.625,145,0.06024096,16.11111,-0.0125,0.44375
2,Israel has 400 people/km^2. Kuwait has 200/km^...,0,-0.02800325,0.3188251,0.4818182,0.3023985,1.214286,1,281,0.04166667,18.73333,-0.02,0.52
3,wow so much anti israeli hate here. Can you je...,1,-0.1209524,0.2310465,0.4990476,0.3358365,0.6428571,1,139,0.03614458,13.9,-0.5,0.5625
4,"Yep, ISIS are some cold blooded bastards, but ...",-1,-0.2,0.1414214,0.8,0.5656854,0.2,0.8,43,0.03448276,21.5,-0.2,0.8
5,SA bought and have a lot of weapons...\n\nI do...,1,-0.6,0.4242641,0.9,0.6363961,0.6,0.9,23,0,11.5,-0.6,0.9
6,"You are missing the point, they don't live nor...",1,0.1337662,0.3021274,0.3928571,0.08838835,0.4272727,0.125,49,0.08823529,24.5,0.07272727,0.375
7,ISIS is a direct spawn of KSA's spread of wahh...,2,0.1,0,0.4,0,0,0,11,0,11,0.1,0.4
8,"Western inaction? That's hilarious, the US has...",1,0.05106421,0.2534825,0.4480249,0.3401563,1.5,1,705,0.1193059,28.2,0.18,0.5
9,It's old news that JAN has been part of the re...,19,0.2361111,0.2457352,0.4694444,0.3213418,0.6555556,0.8888889,100,0.01666667,12.5,-0.1555556,0.2888889


In [13]:
#store ISIS data
# Pickles are byte streams: write them through binary-mode handles and close
# each file as soon as the dump is complete.
with open('processedData/ISIS_Data.pickle', 'wb') as dataFile:
    pickle.dump(aboutISIS_df, dataFile)
with open('processedData/ISIS_Adj.pickle', 'wb') as adjFile:
    pickle.dump(adj_ISIS, adjFile)
aboutISIS_df.targetPol.mean()

0.018691152333952056

# Now switching to politics data generation

In [23]:
# Close the current connection and open the politics database instead.
con.close()
con = sqlite3.connect('politics.sqlite')
c = con.cursor()

# Rebind df (and the module-level text column) to the politics1 table.
df = pd.read_sql("SELECT * FROM politics1", con)
data = df['body']
def getSubFrameByKeyword(dataframe,keyword):
    """Return the rows of ``dataframe`` whose ``body`` text contains ``keyword``.

    The match is a plain, case-sensitive substring test (``keyword in body``).
    The input frame is not modified; the result is a new frame whose index is
    renumbered 0..n-1.
    """
    # Filter on the frame that was passed in rather than on the module-level
    # `data`, so the result does not depend on which table was loaded last.
    keep = np.array([keyword in body for body in dataframe.body], dtype=bool)
    return dataframe.loc[keep].reset_index(drop=True)

In [24]:
# Posts that mention 'Sanders': build the subset, report its size, preview it
df_Sanders = getSubFrameByKeyword(df, 'Sanders')
print(len(df_Sanders))
df_Sanders.head()

7678


Unnamed: 0,body,score
0,"Ah, Ninjew, we feel the same.\n\nWait! Nin*jew...",1
1,I mean I suppose we'll find out. I just have n...,1
2,But they agree on the most pressing issues whi...,2
3,&gt;Sorry. I was sidetracked by hilarious imag...,2
4,"No, I don't think you quite understand what I'...",1


In [25]:
# Attribute table and adjective list for the posts in which Sanders is a subject
(aboutSanders_df, adj_Sanders) = generateData(df_Sanders, 'Sanders')
aboutSanders_df

2836
2836


Unnamed: 0,body,score,overallPol,stdPol,overallSub,stdSub,polRange,subRange,wordCount,bigWords,sentLen,targetPol,targetSub
0,"Ah, Ninjew, we feel the same.\n\nWait! Nin*jew...",1,0.02272727,0.2898231,0.5541667,0.4504297,1,1,60,0.1282051,8.571429,0.5,1
1,I mean I suppose we'll find out. I just have n...,1,-0.053125,0.4402296,0.503125,0.1404976,0.8125,0.275,39,0,13,0.5,0.5
2,But they agree on the most pressing issues whi...,2,0.425,0.2565801,0.4,0.2516611,0.5,0.5,36,0.1363636,12,0.35,0.3
3,&gt;Sorry. I was sidetracked by hilarious imag...,2,0.4166667,0.2393568,0.962963,0.4843221,0.5,1,36,0.1,9,0.5,1
4,"To be fair, Al Gore had the charisma of a dead...",1,0.3285714,0.07778175,0.6428571,0.007071068,0.11,0.01,34,0,17,0.36,0.64
5,Could we just have Biden again? Is that possib...,2,0.2,0.1732051,0.7333333,0.5033223,0.3,1,22,0,7.333333,0.3,0.6
6,Sanders is running in the Democratic *primary*...,4,0.16,0.1414214,0.34,0.1900349,0.2,0.26875,46,0.15,46,0.16,0.34
7,"Okay, so confirmed: You don't know anything ab...",3,0.1612422,0.2476897,0.3703416,0.2859949,0.8133333,0.9533333,170,0.01785714,14.16667,0.09444444,0.2577778
8,Bernie Sanders is a conspiratard. The idea tha...,-15,0.2435714,0.3635703,0.4638095,0.3147068,1.080357,0.9,153,0.03296703,15.3,-0.04017857,0.3446429
9,Do you think that Ted Cruz has a chance of win...,0,0.45,0.3181981,0.625,0.4419417,0.45,0.625,19,0,9.5,0,0


In [26]:
#store Sanders data
# Pickles are byte streams: write them through binary-mode handles and close
# each file as soon as the dump is complete.
with open('processedData/SandersData.pickle', 'wb') as dataFile:
    pickle.dump(aboutSanders_df, dataFile)
with open('processedData/SandersAdj.pickle', 'wb') as adjFile:
    pickle.dump(adj_Sanders, adjFile)
aboutSanders_df.targetPol.mean()

0.10317065556763513

In [27]:
# Posts that mention 'Obama': build the subset, report its size, preview it
df_Obama = getSubFrameByKeyword(df, 'Obama')
print(len(df_Obama))
df_Obama.head()

7619


Unnamed: 0,body,score
0,I mean I suppose we'll find out. I just have n...,1
1,This. If a third party received enough votes t...,1
2,There's no danger of that happening since Bara...,4
3,&gt;Add that they consistently pick losers and...,4
4,That's true but I think as a libertarian one m...,1


In [28]:
# Attribute table and adjective list for the posts in which Obama is a subject
(aboutObama_df, adj_Obama) = generateData(df_Obama, 'Obama')
aboutObama_df

2703
2703


Unnamed: 0,body,score,overallPol,stdPol,overallSub,stdSub,polRange,subRange,wordCount,bigWords,sentLen,targetPol,targetSub
0,I mean I suppose we'll find out. I just have n...,1,-0.053125,0.4402296,0.503125,0.1404976,0.8125,0.275,39,0,13,-0.2,0.4125
1,There's no danger of that happening since Bara...,4,0.3666667,0.2516611,0.6666667,0.3818813,0.5,0.75,57,0.05882353,19,0.4,0.625
2,&gt;Add that they consistently pick losers and...,4,0.2833333,0.5480078,0.2833333,0.1237437,0.775,0.175,22,0.07692308,11,0.8,0.4
3,Do these idiots have anything serious to run o...,37,-0.4895833,0,0.6270833,0,0,0,14,0,14,-0.4895833,0.6270833
4,"Okay, so confirmed: You don't know anything ab...",3,0.1612422,0.2476897,0.3703416,0.2859949,0.8133333,0.9533333,170,0.01785714,14.16667,-0.1071429,0.5714286
5,Democrats voting against Hillary for a third p...,0,0.2204167,0.1888784,0.3254167,0.3341027,0.4544444,0.9,131,0.04444444,14.55556,0,0
6,How did they change the rules to stop a third ...,1,0.07069597,0.06156715,0.2533883,0.2089161,0.15,0.525,86,0.05769231,14.33333,0,0
7,Of course he does! Perhaps you've noticed Hill...,1,-0.04097222,0.09696699,0.4834201,0.2125309,0.2464286,0.5285714,107,0.01666667,21.4,0.125,0.4583333
8,"&gt;""He's made decisions that I think have inf...",2,0.1222222,0.09166667,0.4222222,0.2967708,0.1833333,0.6083333,121,0.1234568,30.25,0.1833333,0.6083333
9,I don't think she would have a shot regardless...,1,0.07727273,0.06701442,0.4709091,0.3224141,0.1363636,0.6333333,63,0.06666667,15.75,0,0


In [29]:
#store Obama data
# Pickles are byte streams: write them through binary-mode handles and close
# each file as soon as the dump is complete.
with open('processedData/ObamaData.pickle', 'wb') as dataFile:
    pickle.dump(aboutObama_df, dataFile)
with open('processedData/ObamaAdj.pickle', 'wb') as adjFile:
    pickle.dump(adj_Obama, adjFile)
aboutObama_df.targetPol.mean()

0.0581801943371772

In [30]:
# Posts that mention 'Republicans': build the subset, report its size, preview it
df_Rep = getSubFrameByKeyword(df, 'Republicans')
print(len(df_Rep))
df_Rep.head()

4689


Unnamed: 0,body,score
0,This. If a third party received enough votes t...,1
1,But the media is going to tell you that Republ...,-2
2,"""Unfortunately, House Republicans refused to g...",1
3,&gt;a statewide poll in Louisiana found that n...,181
4,"If he makes the primaries, and there is a good...",1


In [31]:
# Attribute table and adjective list for the posts in which Republicans is a subject
(aboutRep_df, adj_Rep) = generateData(df_Rep, 'Republicans')
aboutRep_df

2192
2192


Unnamed: 0,body,score,overallPol,stdPol,overallSub,stdSub,polRange,subRange,wordCount,bigWords,sentLen,targetPol,targetSub
0,This. If a third party received enough votes t...,1,0.1125,0.2285598,0.2347222,0.1547661,0.525,0.375,70,0.025,17.5,-0.125,0.375
1,"""Unfortunately, House Republicans refused to g...",1,0.02026144,0.246675,0.4748366,0.3202845,0.9666667,1,368,0.1087866,26.28571,-0.5,1
2,"If he makes the primaries, and there is a good...",1,0.1501299,0.1973795,0.4051948,0.240372,0.35,0.4753247,79,0.02272727,26.33333,0,0
3,It has but they still like to throw him out th...,1,0.2857143,0,0.5357143,0,0,0,18,0.1,18,0.2857143,0.5357143
4,I dont think it's as mindless of a decision as...,1,0.1683333,0.07676322,0.4066667,0.2149505,0.15,0.4166667,83,0.04761905,27.66667,0.1466667,0.2333333
5,You guys are a trip. \n\nRepublicans have been...,1,-0.5,0.2886751,0.5625,0.3247595,0.5,0.5625,30,0.05555556,10,0,0
6,"Candadian here, so I mostly follow out of inte...",2,0.475,0.03125,0.55,0.0625,0.0625,0.125,63,0.02702703,15.75,0.5,0.5
7,Fair enough. I respect the passion and drive o...,1,-0.153,0.4501567,0.605,0.393059,1.5,1,125,0.08450704,10.41667,-0.5375,0.6458333
8,I think low voter turnout is due to people thi...,5,-0.078125,0.07365696,0.6375,0.3417683,0.1041667,0.4833333,45,0.07692308,22.5,-0.1041667,0.5166667
9,Well of course he does.\n\nObama is a Democrat...,29,0.04794408,0.1796433,0.2671053,0.2336044,0.75,0.6666667,232,0.02877698,16.57143,-0.04861111,0.1236111


In [32]:
#store Republicans data
# Pickles are byte streams: write them through binary-mode handles and close
# each file as soon as the dump is complete.
with open('processedData/RepublicansData.pickle', 'wb') as dataFile:
    pickle.dump(aboutRep_df, dataFile)
with open('processedData/RepublicansAdj.pickle', 'wb') as adjFile:
    pickle.dump(adj_Rep, adjFile)
aboutRep_df.targetPol.mean()

0.042080869891589885

# Switching to technology analysis

In [33]:
# Close the current connection and open the technology database instead.
con.close()
con = sqlite3.connect('technology.sqlite')
c = con.cursor()

# Rebind df (and the module-level text column) to the technology1 table.
df = pd.read_sql("SELECT * FROM technology1", con)
data = df['body']
def getSubFrameByKeyword(dataframe,keyword):
    """Return the rows of ``dataframe`` whose ``body`` text contains ``keyword``.

    The match is a plain, case-sensitive substring test (``keyword in body``).
    The input frame is not modified; the result is a new frame whose index is
    renumbered 0..n-1.
    """
    # Filter on the frame that was passed in rather than on the module-level
    # `data`, so the result does not depend on which table was loaded last.
    keep = np.array([keyword in body for body in dataframe.body], dtype=bool)
    return dataframe.loc[keep].reset_index(drop=True)

In [34]:
# Posts that mention 'Apple': build the subset, report its size, preview it
df_Apple = getSubFrameByKeyword(df, 'Apple')
print(len(df_Apple))
df_Apple.head()

2020


Unnamed: 0,body,score
0,I'm assuming that's just raw material cost...d...,14
1,Record breaking revenues and profits and Apple...,3
2,Not sure if it's a real mistake. App develope...,1
3,"This is the best tl;dr I could make, [original...",2
4,Rule of thumb is a 3-4x multiplier. That is li...,10


In [35]:
# Attribute table and adjective list for the posts in which Apple is a subject
(aboutApple_df, adj_Apple) = generateData(df_Apple, 'Apple')
aboutApple_df

716
716


Unnamed: 0,body,score,overallPol,stdPol,overallSub,stdSub,polRange,subRange,wordCount,bigWords,sentLen,targetPol,targetSub
0,Record breaking revenues and profits and Apple...,3,0.5,0.3535534,0.8888889,0.6285394,0.5,0.8888889,12,0,6,0,0
1,But if we did that it would make Apple look le...,25,-0.4333333,0.3064129,0.3666667,0.2592725,0.4333333,0.3666667,24,0,24,-0.4333333,0.3666667
2,To everyone saying that this doesn't take into...,-1,0.2559524,0.2058186,0.5392857,0.1477348,0.2910714,0.2089286,95,0.06666667,47.5,0.45,0.4
3,&gt; this is still a really good representatio...,5,0.3142857,0.2753785,0.4619048,0.1028528,0.55,0.2,66,0.06060606,22,0.45,0.4
4,"This is the best tl;dr I could make, [original...",3,0.3483601,0.348198,0.5692675,0.2682139,0.9,0.7,232,0.1319444,33.14286,0.2098485,0.4356061
5,You do realize that's a very vague statement r...,-1,0.1157814,0.2090643,0.4583028,0.1514349,0.4071429,0.2850794,81,0.08888889,27,0.1038889,0.3077778
6,To their other Apple products.,3,-0.125,0,0.375,0,0,0,5,0,5,-0.125,0.375
7,Apple does a lot for worker conditions and rig...,4,0.6,0.4242641,1,0.7071068,0.6,1,32,0.1176471,16,0,0
8,Apple is close to the largest company in the w...,0,0.2,0.1154701,0.2,0.1154701,0.2,0.2,39,0,13,0.1,0.1
9,What about in laptops/desktop PCs? It's a very...,3,0.13,0.07505553,0.13,0.07505553,0.13,0.13,27,0.08333333,9,0.13,0.13


In [36]:
#store Apple data
# Pickles are byte streams: write them through binary-mode handles and close
# each file as soon as the dump is complete.
with open('processedData/AppleData.pickle', 'wb') as dataFile:
    pickle.dump(aboutApple_df, dataFile)
with open('processedData/AppleAdj.pickle', 'wb') as adjFile:
    pickle.dump(adj_Apple, adjFile)
aboutApple_df.targetPol.mean()

0.099596692391123232

In [37]:
# Posts that mention 'Windows': build the subset, report its size, preview it
df_Windows = getSubFrameByKeyword(df, 'Windows')
print(len(df_Windows))
df_Windows.head()

1354


Unnamed: 0,body,score
0,Motorola is the one of the few manufacturers t...,1
1,&gt; But here you go with how Google is someho...,1
2,That's not true though. The operating systems...,1
3,"This is the best tl;dr I could make, [original...",1
4,Which is horrible to work with as a technician...,1


In [38]:
# Attribute table and adjective list for the posts in which Windows is a subject
(aboutWindows_df, adj_Windows) = generateData(df_Windows, 'Windows')
aboutWindows_df

214
214


Unnamed: 0,body,score,overallPol,stdPol,overallSub,stdSub,polRange,subRange,wordCount,bigWords,sentLen,targetPol,targetSub
0,"Exactly. IMO, this is throwing in the towel ...",1,-0.05,0.1689839,0.3569444,0.2028786,0.475,0.4583333,48,0.137931,9.6,0,0
1,"This is the best tl;dr I could make, [original...",1,0.335,0.3645366,0.5783333,0.2559801,0.9,0.7,175,0.13,25,0.1625,0.4708333
2,I haven't been able to do anything useful in t...,1,0.2292798,0.2688978,0.4991145,0.3019752,0.6746753,0.7678571,83,0.02857143,13.83333,0.3,0.35
3,Only Windows 7 and up are qualified for the fr...,1,0.35,0.2516611,0.525,0.4821825,0.5,0.9,86,0.02083333,28.66667,0.2,0.9
4,"This is the best tl;dr I could make, [original...",0,0.2363492,0.3679178,0.4031746,0.3039707,0.9863946,0.7693878,221,0.1147541,31.57143,0.01360544,0.2306122
5,The application is called CCF (Control Centre ...,23,0.02852564,0.30911,0.3967949,0.3197077,1.55,0.95,299,0.1017964,17.58824,0,0.5625
6,I had someone argue repeatedly that if you eve...,38,-0.06455026,0.199623,0.3643671,0.3577011,0.55,0.9,94,0,18.8,-0.06666667,0.09230769
7,"Free Windows, or advertising. Pick one.",-11,0.4,0.2828427,0.8,0.5656854,0.4,0.8,6,0.25,3,0.4,0.8
8,Anybody who thinks Linux is safe from viruses ...,8,-0.1708333,0.3800699,0.5555556,0.1252168,0.5375,0.1770833,43,0,21.5,-0.35,0.6145833
9,Unfortunately I don't leave my home computer o...,15,0.09583333,0.2801367,0.6666667,0.2359323,0.65,0.5625,53,0.03448276,13.25,0,1


In [39]:
#store Windows data
# Pickles are byte streams: write them through binary-mode handles and close
# each file as soon as the dump is complete.
with open('processedData/WindowsData.pickle', 'wb') as dataFile:
    pickle.dump(aboutWindows_df, dataFile)
with open('processedData/WindowsAdj.pickle', 'wb') as adjFile:
    pickle.dump(adj_Windows, adjFile)
aboutWindows_df.targetPol.mean()

0.11029302089868956

In [40]:
# Posts that mention 'Tesla': build the subset, report its size, preview it
df_Tesla = getSubFrameByKeyword(df, 'Tesla')
print(len(df_Tesla))
df_Tesla.head()

1917


Unnamed: 0,body,score
0,I'm extremely surprised by what is in this ann...,312
1,What exactly is Tesla 'introducing' here? It's...,-15
2,So now a few thousand dollars will net you bei...,5
3,"This is the best tl;dr I could make, [original...",2
4,"Also ""Tesla’s selling price to installers is $...",10


In [41]:
# Attribute table and adjective list for the posts in which Tesla is a subject
(aboutTesla_df, adj_Tesla) = generateData(df_Tesla, 'Tesla')
aboutTesla_df

615
615


Unnamed: 0,body,score,overallPol,stdPol,overallSub,stdSub,polRange,subRange,wordCount,bigWords,sentLen,targetPol,targetSub
0,I'm extremely surprised by what is in this ann...,312,0.1545455,0.1770361,0.4363636,0.3353067,0.4,0.9,114,0.08974359,16.28571,0.0375,0.325
1,What exactly is Tesla 'introducing' here? It's...,-15,0.25,0.1767767,0.25,0.1767767,0.25,0.25,31,0.0625,15.5,0.25,0.25
2,So now a few thousand dollars will net you bei...,5,0.168254,0.3713084,0.4848765,0.3011392,0.8,0.7319444,123,0.09859155,30.75,0.1071429,0.5625
3,"This is the best tl;dr I could make, [original...",2,0.2771605,0.4354661,0.7070988,0.2986499,1.155556,0.7111111,184,0.1619048,26.28571,0.125,0.875
4,Using a typical Deep Cycle Lead Acid setup for...,5,0.006666667,0.1204736,0.56,0.2780138,0.2833333,0.6,69,0.04255319,17.25,0,1
5,"It's like V2G, I wouldn't be surprised if you ...",0,0.1166667,0.08249579,0.675,0.4772971,0.1166667,0.675,22,0,22,0.1166667,0.675
6,Tesla is really going to be the Google of elec...,8,0.2,0,0.2,0,0,0,11,0.3333333,11,0.2,0.2
7,"This is the best tl;dr I could make, [original...",2,0.3714286,0.4117327,0.6571429,0.385604,1,1,160,0.1157895,22.85714,0.1,0.45
8,I'm just saying Tesla isn't making them disape...,-7,0,0,0,0,0,0,14,0,14,0,0
9,This is what I don't understand. Tesla has pol...,2,0.45,0.3181981,0.475,0.3358757,0.45,0.475,29,0,14.5,0.45,0.475


In [42]:
#store Tesla data
# Pickles are byte streams: write them through binary-mode handles and close
# each file as soon as the dump is complete.
with open('processedData/TeslaData.pickle', 'wb') as dataFile:
    pickle.dump(aboutTesla_df, dataFile)
with open('processedData/TeslaAdj.pickle', 'wb') as adjFile:
    pickle.dump(adj_Tesla, adjFile)
aboutTesla_df.targetPol.mean()

0.092510600732450377

In [None]:
'''
Good references for understanding the code/NLP in general:
http://billchambers.me/tutorials/2015/01/14/python-nlp-cheatsheet-nltk-scikit-learn.html
http://www.nltk.org/book/
'''