In [259]:
import pandas as pd
import numpy as np
import sqlite3
import nltk, re, pprint
from nltk import word_tokenize

# Open the local SQLite database file and load the whole `worldnews1` table
# into memory as a DataFrame.
con = sqlite3.connect('worldnews.sqlite')
c = con.cursor()  # NOTE(review): this cursor is not used by any visible cell


df=pd.read_sql("SELECT * FROM worldnews1",con)


#store = pd.HDFStore('store.h5')

# TODO: the chunked HDFStore load below is disabled (kept as a bare string).
# The author suspected HDFStore does not handle unicode text -- confirm before
# re-enabling. Note that `query` and `store` are not defined in the visible code.
'''
counter=0
for chunk in pd.read_sql(query,con, chunksize=50000):
    store.append(str(counter),chunk)
    counter+=1

print store
'''
# Order posts from highest to lowest score, then renumber the rows so the
# index runs 0..n-1 in that order.
# NOTE(review): DataFrame.sort exists only in older pandas releases; newer
# releases use sort_values -- confirm the pandas version this notebook targets.
df.sort('score',ascending=False,inplace=True)
df.reset_index(drop=True,inplace=True)
data = df.body  # module-level alias of the sorted frame's body column

def getSubFrameByKeyword(dataframe,keyword):
    """Return the rows of `dataframe` whose `body` text contains `keyword`.

    Rows are selected from `dataframe.body` itself, so the result does not
    depend on any module-level variable or on the frame's index labels.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a `body` column of strings.
    keyword : str
        Substring to look for (case-sensitive `in` test).

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the matching rows, in their original order,
        re-indexed 0..k-1. The input frame is not modified.
    """
    # Build the mask positionally from the frame that was passed in; an explicit
    # bool dtype keeps the mask valid even when the frame has no rows.
    mask = np.array([keyword in text for text in dataframe.body], dtype=bool)
    newDf = dataframe[mask].reset_index(drop=True)#resets index
    return newDf

#breaking down dataset: build the sub-frame selected by the keyword 'China'
df_China = getSubFrameByKeyword(df,'China')
print len(df_China)  # number of rows in the China sub-frame
# Preview the first rows. NOTE(review): the call below appears twice; only the
# last expression of a cell is rendered, so the first one has no visible effect.
df_China.head()
df_China.head()

7605


Unnamed: 0,body,score
0,Man who tricks people into believing he is ric...,3904
1,"""I spent 33 years and four months in active mi...",3105
2,And immediately claimed by China based on anci...,2163
3,There's zero evidence that any of the natural ...,1635
4,I think China is in the running for first coun...,1159


In [262]:
#defining methods for attribute generation
from textblob import TextBlob
import textblob

'''
    In generating attributes, we assume that performance (the post's score on
    Reddit) will depend on what is said AND how the author of the post says it.

    We have developed attributes which characterize how an argument is made:
     - What is the general tone of the post?
     - How subjective is it overall?
     - Does the author consider alternative points or arguments (which might register
       as a change in polarity of sentiment for a sentence within the post)?
     - Does the author provide both fact and opinion (high range in subjectivity)?
     - How terse is the prose?
     - How large are the non-trivial (not the, in, etc.) words in the post?

    However, we also want to characterize sentiment with respect to the subject
    of the post. This requires us to identify whether the targetString is a subject
    or simply appears within the post. We identify the sentences in which the
    targetString is the subject and conduct further analysis on these individual
    sentences. In instances where the targetString is the subject of several sentences,
    we calculate aggregate metrics, similar to the analysis used for overall tone.
    We will refer to this analysis as "target-specific":
     - What is the polarity and subjectivity of the sentences in which the targetString
       is referenced? (If the targetString is the subject of multiple sentences, an
       average value is reported)
    
    *It is worth noting that this analysis cannot detect sarcasm or humor. This is a
    significant flaw in this analysis, but it would require cutting edge NLP--beyond
    the scope of this project.
    
    This script also generates a list of adjectives used to directly describe the
    targetString. This list will be used to generate a word-cloud during post-analysis.
'''

### Defining how-it-is-said analysis methods ###

def getSentiment(text):
    """Return `[polarity, subjectivity]` for `text` as scored by TextBlob.

    The text is parsed into a single TextBlob and its `sentiment` is read once,
    instead of building a separate blob for each of the two scores.

    Parameters
    ----------
    text : str
        The text to score.

    Returns
    -------
    list
        Two-element list: `[polarity, subjectivity]`.
    """
    sentiment = TextBlob(text).sentiment
    return [sentiment.polarity, sentiment.subjectivity] #returning scores for polarity and subjectivity

def getSentences(text):
    """Split `text` into sentences and return TextBlob's `sentences` list."""
    blob = TextBlob(text)
    return blob.sentences

def getTerseness(text):
    """Return the mean number of words per sentence in `text`.

    Sentences come from `getSentences`; each sentence's length is the size of
    its `words` collection. The result is a numpy float.
    """
    wordsPerSentence = np.array(
        [len(sentence.words) for sentence in getSentences(text)],
        dtype=float,
    )
    return wordsPerSentence.mean()

def getWordCount(text):
    """Return how many words TextBlob finds in `text`."""
    words = TextBlob(text).words
    return len(words)

def howPretentious(text,lenLim):
    """Return the share of "big" words in `text`.

    Big words are those longer than `lenLim` characters. The count is
    normalized by the number of "real" words, i.e. words longer than 3
    characters; that length cut-off drops most stop words without needing a
    stop-word list.

    Parameters
    ----------
    text : str
        The text to analyze.
    lenLim : int
        A word counts as big when `len(word) > lenLim`.

    Returns
    -------
    float
        `big_words / real_words`, or 0.0 when the text has no word longer than
        3 characters (previously this case raised ZeroDivisionError).
    """
    words = TextBlob(text).words
    bigWordCount = 0
    realWordCount = 0
    for word in words:
        if len(word) > lenLim:
            bigWordCount += 1
        if len(word) > 3:# excluding most stop-words
            realWordCount += 1
    if realWordCount == 0:
        # Nothing to normalize by (e.g. "ok" or an empty post).
        return 0.0
    return float(bigWordCount)/realWordCount

def sentenceToSentArray(sentences):
    """Collect per-sentence sentiment into a 2 x len(sentences) array.

    Row 0 holds each sentence's `sentiment.polarity`, row 1 holds each
    sentence's `sentiment.subjectivity`, in the order the sentences are given:

        [[polarity, ...],
         [subjectivity, ...]]
    """
    scores = np.zeros((2, len(sentences)))#pol,sub
    for column, sentence in enumerate(sentences):
        scores[0, column] = sentence.sentiment.polarity
        scores[1, column] = sentence.sentiment.subjectivity
    return scores

def sentStd(text):
    """Return `[std_polarity, std_subjectivity]` across the sentences of `text`.

    Both values are sample standard deviations (`ddof=1`), so a text with a
    single sentence yields NaN for each.
    """
    scores = sentenceToSentArray(getSentences(text))
    polarityStd = np.std(scores[0], ddof=1)
    subjectivityStd = np.std(scores[1], ddof=1)
    return [polarityStd, subjectivityStd]

def sentRange(text):
    """Return `[polarity_range, subjectivity_range]` for the sentences of `text`.

    Each range is the peak-to-peak spread (max minus min) of that score over
    all sentences.
    """
    scores = sentenceToSentArray(getSentences(text))
    polaritySpan = np.ptp(scores[0])
    subjectivitySpan = np.ptp(scores[1])
    return [polaritySpan, subjectivitySpan]


### Defining what-is-said analysis methods ###

def getSubjects(text,targetString):
    """Flag, per sentence, whether `targetString` sits inside a chunked noun phrase.

    The text is split into sentences; each sentence is tokenized, POS-tagged
    and chunked with a regular-expression grammar. A sentence is flagged when
    one of the resulting NP chunks contains a token equal to `targetString`.

    Parameters
    ----------
    text : str
        The post to analyze.
    targetString : str
        Token to look for, compared against whole tokens (exact match).

    Returns
    -------
    list of bool
        One flag per sentence, in sentence order.
    """
    sentences = getSentences(text)

    #looking for phrases with adjective or verb directly attached to the noun
    #if both occur, both are captured within the phrase.
    #This indicates that the noun of interest is either being described
    #or an action of the noun is being discussed.
    #We also remove nouns which are preceded by prepositions, since this reduces
    #the likelihood that the noun is the subject of a post.
    grammar = r"""NP: {<J.*><IN>?<N.*><V.*>?<J.*>?}
                    {<J.*>?<IN>?<N.*><V.*><J.*>?}
                    }<J.*>?<IN><N.*><V.*>?<J.*>?{"""
    # The grammar never changes, so the chunk parser is built once rather than
    # once per sentence.
    cp = nltk.RegexpParser(grammar)

    isSub = []
    for sent in sentences:
        tagged = nltk.pos_tag(nltk.word_tokenize(str(sent)))#using a part-of-speech tagger
        #using noun phrase chunking
        result = cp.parse(tagged)
        found = False
        #processing Noun Phrase subtrees; loose tokens outside a chunk are skipped
        for subtree in result:
            if not isinstance(subtree, nltk.tree.Tree):
                continue
            # Leaves are (word, tag) pairs. Compare against the word only so a
            # targetString that happens to equal a POS tag (e.g. 'IN') cannot
            # produce a false match.
            if any(word == targetString for word, _tag in subtree.leaves()):
                found = True
                break
        isSub.append(found)
    return isSub

def getTargetSentiment(text,isSub):
    """Average the sentiment of the sentences flagged in `isSub`.

    `isSub` holds one truthy/falsy flag per sentence of `text`. Every flagged
    sentence is scored with `getSentiment`, and the function returns
    `[mean_polarity, mean_subjectivity]` over those sentences.
    """
    sentences = getSentences(text)

    # Size the score table from the number of flagged entries.
    flaggedCount = sum(1 for flag in isSub if flag)
    targetSent = np.zeros((2, flaggedCount))

    slot = 0
    for position, sentence in enumerate(sentences):
        if not isSub[position]:
            continue
        scores = getSentiment(str(sentence))
        targetSent[0, slot] = scores[0]
        targetSent[1, slot] = scores[1]
        slot += 1

    meanPolarity = targetSent[0].mean()
    meanSubjectivity = targetSent[1].mean()
    return [meanPolarity, meanSubjectivity]

def generateSeries(df,index,isSub,colNames):
    """Compute the attribute row for the post at `df.body[index]`.

    The body is ASCII-encoded (dropping characters that cannot be encoded)
    before any analysis. `isSub` is handed to `getTargetSentiment` for the
    target-specific scores. `colNames` is accepted but not used here.

    Returns a list ordered as: body, score, overall polarity, polarity std,
    overall subjectivity, subjectivity std, polarity range, subjectivity
    range, word count, big-word ratio, mean sentence length, target polarity,
    target subjectivity.
    """
    body = df.body[index].encode('ascii','ignore')
    score = df.score[index]

    overallPol, overallSub = getSentiment(body)
    stdPol, stdSub = sentStd(body)
    polRange, subRange = sentRange(body)
    wordCount = getWordCount(body)
    bigWords = howPretentious(body,10)#flagging words > 10 (relative to > 3 chars)
    sentLen = getTerseness(body)
    targetPol, targetSub = getTargetSentiment(body,isSub)

    return [
        body, score,
        overallPol, stdPol,
        overallSub, stdSub,
        polRange, subRange,
        wordCount, bigWords, sentLen,
        targetPol, targetSub,
    ]

data = df_China.body
trueCounter=0

'''
for z in range(len(data)):
    sents = getSentences(data[z].encode('ascii','ignore'))
    #for i in sents:
    #    print i
    #howPretentious(data[0],10)
    hasSub=getSubjects(sents,'China')
    if True in hasSub:
        trueCounter+=1
        if trueCounter>=10 and trueCounter<=25:
            print data[z]
            print '\n\n'
'''

columnNames=['body','score','overallPol','stdPol','overallSub','stdSub','polRange','subRange',
             'wordCount','bigWords','sentLen','targetPol','targetSub']
for z in range(len(data)):
    hasSub=getSubjects(data[z].encode('ascii','ignore'),'China')
    
    if True in hasSub:
        trueCounter+=1

print trueCounter        

prealoc = range(trueCounter)
aboutChina_df = pd.DataFrame(index=prealoc,columns=columnNames)

trueCounter=0

#generating aboutChina DataFrame with attributes
for z in range(len(data)):
    hasSub=getSubjects(data[z].encode('ascii','ignore'),'China')
    
    if True in hasSub:
        #run analysis
        aboutChina_df.loc[trueCounter] = generateSeries(df_China,z,hasSub,columnNames)
        trueCounter+=1
#aboutChina_df = pd.DataFrame(aboutChina_df,columns=columnNames)
print trueCounter
aboutChina_df

2452
2452


Unnamed: 0,body,score,overallPol,stdPol,overallSub,stdSub,polRange,subRange,wordCount,bigWords,sentLen,targetPol,targetSub
0,I think China is in the running for first coun...,1159,0.25,,0.3333333,,0,0,16,0,16,0.25,0.3333333
1,Interesting that it mentions that being on the...,910,-0.03583333,0.2791648,0.3613889,0.3472624,0.7729167,0.8888889,115,0.0617284,23,0.01111111,0.1055556
2,He even road in a US jeep with US flags.. and ...,593,0.1890625,0.1600018,0.7,0.4253675,0.4,1,87,0.02173913,14.5,0,0
3,"To be fair, the US was built on British techno...",476,0.1333333,0.2828427,0.2666667,0.0942809,0.4,0.1333333,27,0.1538462,13.5,0.2333333,0.3
4,China is catching up fast.\n\nRIP Inbox,328,0.4,0.2828427,0.75,0.5303301,0.4,0.75,7,0,3.5,0.4,0.75
5,China is like that kid who hit a growth spurt ...,289,-0.1958333,0.3897393,0.7241667,0.4674185,1.089583,1,83,0,16.6,0,0
6,China has been putting a lot of effort into he...,275,-0.04166667,0.08333333,0.01666667,0.03333333,0.1666667,0.06666667,49,0.03225806,12.25,0,0
7,"China is basically saying ""fuck off"" to their ...",240,0.1045455,0.1663434,0.6409091,0.3391216,0.39375,0.9375,125,0.08860759,25,-0.2,0.5
8,But it comes with some strings attached. For ...,217,0.45,0.3,0.8125,0.5051815,0.6,1,48,0,16,0.45,0.8125
9,"Yeah, that whole 'we were first' thing that we...",214,0.005,0.1795023,0.5955556,0.3309256,0.5510417,1,83,0.05882353,6.916667,0.1,0.2


In [122]:
#creating Israel dataframe (sub-frame of df selected by the keyword 'Israel')
df_Israel = getSubFrameByKeyword(df,'Israel')
print len(df_Israel)  # number of rows in the Israel sub-frame
df_Israel.head()  # preview the first rows

14145


Unnamed: 0,body,score
0,The US is positively going to continue to arm ...,2408
1,"This was already posted, but in a thread so lo...",1177
2,He's absolutely right. Palestinians definitely...,1063
3,Few more statistics:\n\n- 24% of British Musli...,865
4,1. Supported(s) imprisonment of whistle blower...,650


In [125]:
#creating Russia dataframe (sub-frame of df selected by the keyword 'Russia')
df_Russia = getSubFrameByKeyword(df,'Russia')
print len(df_Russia)  # number of rows in the Russia sub-frame
df_Russia.head()  # preview the first rows

12600


Unnamed: 0,body,score
0,Well he shouldn't be concerned because Russia ...,3587
1,"Ah Russia, where dying for your country is a s...",2461
2,&gt; A group of eight men later arrived in a B...,1632
3,"Lol, it's not Russia dude, the tags were taken...",1628
4,"This is the ancestral Habsburg land, Austria c...",1502


In [None]:
'''
Good references for understanding the code/NLP in general:
http://billchambers.me/tutorials/2015/01/14/python-nlp-cheatsheet-nltk-scikit-learn.html

'''