In [1]:
import pandas as pd
import numpy as np
import re, reprlib, sys

%load_ext cython
%load_ext cythonmagic

import nltk as n
import nltk, nltk.classify.util, nltk.metrics, nltk.tokenize, nltk.stem, nltk.book
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.classify import MaxentClassifier
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures as BAM
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.classify.scikitlearn import SklearnClassifier

from itertools import chain

import sklearn as sk
from sklearn import cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# n.download()

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908




### Read the data and preview it

In [2]:
# Load the raw status table. sep=None lets pandas sniff the delimiter, which is
# why the python parsing engine is requested.
data = pd.read_csv(
    "data.csv",
    sep=None,
    engine="python",
    encoding="latin-1",
    parse_dates=True,
    infer_datetime_format=True,
)
data.head()

Unnamed: 0,#AUTHID,STATUS,sEXT,sNEU,sAGR,sCON,sOPN,cEXT,cNEU,cAGR,cCON,cOPN,DATE,NETWORKSIZE,BETWEENNESS,NBETWEENNESS,DENSITY,BROKERAGE,NBROKERAGE,TRANSITIVITY
0,b7b7764cfa1c523e4e93ab2a79a946c4,likes the sound of thunder.,2.65,3,3.15,3.25,4.4,n,y,n,n,y,06/19/09 03:21 PM,180,14861.6,93.29,0.03,15661,0.49,0.1
1,b7b7764cfa1c523e4e93ab2a79a946c4,is so sleepy it's not even funny that's she ca...,2.65,3,3.15,3.25,4.4,n,y,n,n,y,07/02/09 08:41 AM,180,14861.6,93.29,0.03,15661,0.49,0.1
2,b7b7764cfa1c523e4e93ab2a79a946c4,is sore and wants the knot of muscles at the b...,2.65,3,3.15,3.25,4.4,n,y,n,n,y,06/15/09 01:15 PM,180,14861.6,93.29,0.03,15661,0.49,0.1
3,b7b7764cfa1c523e4e93ab2a79a946c4,likes how the day sounds in this new song.,2.65,3,3.15,3.25,4.4,n,y,n,n,y,06/22/09 04:48 AM,180,14861.6,93.29,0.03,15661,0.49,0.1
4,b7b7764cfa1c523e4e93ab2a79a946c4,is home. <3,2.65,3,3.15,3.25,4.4,n,y,n,n,y,07/20/09 02:31 AM,180,14861.6,93.29,0.03,15661,0.49,0.1


In [3]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
data.describe()

Unnamed: 0,sEXT,sNEU,sAGR,sCON,sOPN,NETWORKSIZE,BETWEENNESS,NBETWEENNESS,DENSITY,BROKERAGE,NBROKERAGE,TRANSITIVITY
count,9917.0,9917.0,9917.0,9917.0,9917.0,9917.0,9917.0,9917.0,9917.0,9917.0,9917.0,9916.0
mean,3.35476,2.609453,3.616643,3.474201,4.130386,429.37712,135425.315359,94.66517,3.154012,137642.476201,0.48992,0.128821
std,0.857578,0.760248,0.682485,0.737215,0.585672,428.760382,199433.803497,5.506696,311.073343,201392.066555,0.011908,0.106063
min,1.33,1.25,1.65,1.45,2.25,24.0,93.25,0.04,0.0,0.49,0.18,0.0
25%,2.71,2.0,3.14,3.0,3.75,196.0,16902.2,93.77,0.01,17982.0,0.49,0.06
50%,3.4,2.6,3.65,3.4,4.25,317.0,47166.9,96.44,0.02,48683.0,0.49,0.09
75%,4.0,3.05,4.15,4.0,4.55,633.0,196606.0,97.88,0.03,198186.0,0.5,0.17
max,5.0,4.75,5.0,5.0,5.0,29724.9,1251780.0,99.82,30978.0,1263790.0,0.5,0.63


### Combine all the strings for each author


In [4]:
# One row per author: every status written by that author, joined by single spaces.
statuses_by_author = data.groupby(['#AUTHID'])['STATUS']
combined_dataset = statuses_by_author.apply(lambda statuses: ' '.join(statuses)).reset_index()

# Keep a copy on disk, then preview the result.
combined_dataset.to_csv(path_or_buf="oneBigString.csv")
combined_dataset.head()

Unnamed: 0,#AUTHID,STATUS
0,00419a4c96b32cd63b2c7196da761274,back in cali!!!
1,02c37028a782cfda660c7243e45244bb,Supervisor: *PROPNAME* (second preference) Res...
2,03133a828cd0cf52e3752813ce5d818f,Did Cindy 30 times in 20 minutes and GI jane i...
3,03e6c4eca4269c183fa0e1780f73faba,"""Those who criticize our generation forget who..."
4,06b055f8e2bca96496514891057913c3,is enjoying the cricket...comfy boxers and rai...


### Count how many unique values there are in the first column
#### This is the number of users

In [5]:
# Despite its name, `unique_values` holds the first column for every row
# (duplicates included); nunique() is what counts the distinct entries.
unique_values = data[data.columns[0]]
unique_values.nunique()
#len(set(unique_values))

250

In [6]:
# https://stackoverflow.com/questions/21633580/pandas-counting-unique-values-in-a-dataframe
# Number of rows per distinct value of the first column, most frequent first.
# Series.value_counts() is used instead of the top-level pd.value_counts()
# helper, which pandas has deprecated.
unique_values.value_counts()

e6cdef6f475cce3023c5b715f8c9f110    223
6f2bebc01062eb8334dccba3e048fdb5    219
527ed53d2ba3a3bc417b8402d5b2f556    194
d7e500ad854a1b6ced39e53a525b8a6d    184
0737e4e4980f56c9fb1cb5743001c917    172
502db2fcfe26705ae16a46c5cb2ad2e5    165
b4a21c82de4011033c8ac67081ff939c    162
b2be41464b53ffc6deae9536ddfd3aee    159
c3f4b3e345cb6b032db2e0459d179db3    153
715c9eb832dc833a0b6409ddccd268b1    151
f7456ac4e6b20911c40fdad18908a8d2    150
0bfa3d952ffed50f25011b128e73a820    141
dbdfbfda2a4205bd59b22758ceddd5af    126
e4a512374eee079d2b8acc2ce69990d5    126
f2026b8cb48aff9af31577ecbfda5c38    123
e465fadd8b30e8669f397e32e10f6cd0    118
181962441153a36333f0c60701823412    114
8d7faa6d7f104a6cb7c4a9e1c6310a15    114
d39c2b0fb2e50e37795fdbe3b8cd3792    113
eb7f8081aa0bd4004f513d3299db9063    107
521896b01c1a506dc4404e600fa99c5b    104
dba5f5266d03dd6d4db084ad7dbc683c    102
b7b7764cfa1c523e4e93ab2a79a946c4    102
3d7847b1c33b5f5811208b4aa1a7ffbd    101
c5d9ffcb242053b0abdebe0d684fea3a     99


### Our main column consists of rows of sentences, one after another.
#### Make all words lower case

In [7]:
# Lower-case the status text of both the per-status and the per-author tables.
lines_of_data = data["STATUS"].str.lower()
lines_of_combined_dataset = combined_dataset["STATUS"].str.lower()

# Preview the first five entries of each, in the same order as they were built.
for lowered in (lines_of_data, lines_of_combined_dataset):
    print(lowered.head(n=5))

0                          likes the sound of thunder.
1    is so sleepy it's not even funny that's she ca...
2    is sore and wants the knot of muscles at the b...
3           likes how the day sounds in this new song.
4                                          is home. <3
Name: STATUS, dtype: object
0                                      back in cali!!!
1    supervisor: *propname* (second preference) res...
2    did cindy 30 times in 20 minutes and gi jane i...
3    "those who criticize our generation forget who...
4    is enjoying the cricket...comfy boxers and rai...
Name: STATUS, dtype: object


### Now, let's split words for each sentence and make real sentences (python-usable objects) from rows.

In [8]:
# http://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize
twt = n.TreebankWordTokenizer()
english = n.data.load('tokenizers/punkt/PY3/english.pickle')

# Keep only the text before the first newline of every row.
# NOTE(review): anything after a line break inside a status is dropped here -
# confirm that this truncation is intended.
list_of_rows = [row.partition("\n")[0] for row in lines_of_data]
list_of_rows_CD = [row.partition("\n")[0] for row in lines_of_combined_dataset]

# Per-status tokenizations: one entry per row, in row order.
list_of_splitted_words = [n.word_tokenize(row) for row in list_of_rows]
# NOTE(review): `english` is loaded from the punkt model, which presumably splits
# text into sentences rather than words - confirm what `_faster` should contain.
list_of_splitted_words_faster = [english.tokenize(row) for row in list_of_rows]
list_of_sentences = [n.sent_tokenize(row) for row in list_of_rows]

# Per-author tokenization of the combined statuses.
list_of_splitted_words_CD = [n.word_tokenize(row) for row in list_of_rows_CD]

# Placeholders for the Treebank tokenizer variant; nothing in this cell fills them.
list_of_splitted_words_twt, list_of_sentences_twt = [], []


In [9]:
# Peek at the first two rows of the per-status and the per-author text lists.
print(list_of_rows[:2], "\n", list_of_rows_CD[:2])

['likes the sound of thunder.', "is so sleepy it's not even funny that's she can't get to sleep."] 
 ['back in cali!!!', 'supervisor: *propname* (second preference) research area: regional economic integration (fifth prefernece) \x85\x85  tentative examination schedule, semester 1, 2009//10. . abcdefghijklmnopqrstuvwxyz qwertyuiopasdfghjklzxcvbnm mnbvcxzlkjhgfdsapoiuytrewq pa><dol x 2 cartography+select. topics in the geography of china     (????f?add?...) dea dea dea :d']


In [10]:
# https://stackoverflow.com/questions/2058985/python-count-sub-lists-in-nested-list
# Total token count = sum of the lengths of the per-row token lists.
# NOTE: len(list_of_sentences) is the number of rows (list_of_sentences holds one
# list of sentences per row), not the total number of sentences across all rows.
print("there are", sum(len(x) for x in list_of_splitted_words), "tokens and there are", 
      len(list_of_sentences), "sentences \n")

there are 177474 tokens and there are 9917 sentences 



In [11]:
# Quoted note about the tokenizer method that word_tokenize() invokes: it assumes
# that the text has already been segmented into sentences, e.g. using sent_tokenize().

# Show the raw rows next to their sentence-level and word-level tokenizations
# (first two rows only).
print("They both will be same")
print(list_of_rows[:2], "\n")
print(list_of_sentences[:2]) #, " \n", list_of_sentences_twt[:2], "\n")
print(list_of_splitted_words[:2]) #, " \n", list_of_splitted_words_twt[:2])

They both will be same
['likes the sound of thunder.', "is so sleepy it's not even funny that's she can't get to sleep."] 

[['likes the sound of thunder.'], ["is so sleepy it's not even funny that's she can't get to sleep."]]
[['likes', 'the', 'sound', 'of', 'thunder', '.'], ['is', 'so', 'sleepy', 'it', "'s", 'not', 'even', 'funny', 'that', "'s", 'she', 'ca', "n't", 'get', 'to', 'sleep', '.']]


### Different tokenizers (will decide later on which one)

Source: http://text-processing.com/demo/tokenize/


<img src="dif_tokenizers.png">

### Goal: Delete stopwords from each row

#### And store them again in each row (= nested list)

In [12]:
# stopwords.words("english") 
# https://stackoverflow.com/questions/19249201/how-to-create-a-number-of-empty-nested-lists-in-python

# Token lists of the first two rows, before any stop word is removed.
print(list_of_splitted_words[:2])
# De-duplicated English stop words as a list (element order is arbitrary because
# the values pass through a set).
english_stops_list = list(set(stopwords.words('english')))

[['likes', 'the', 'sound', 'of', 'thunder', '.'], ['is', 'so', 'sleepy', 'it', "'s", 'not', 'even', 'funny', 'that', "'s", 'she', 'ca', "n't", 'get', 'to', 'sleep', '.']]


In [13]:
def delete_stopwords(sentences=None, stop_words=None):
    """Return a copy of the tokenized rows with stop words removed.

    The result has exactly one list per input row, in the same order, so it
    stays aligned with the source rows. Each output row is built directly from
    its own input row. Looking the row up again with ``list.index()`` would find
    the first *equal* row, which merges duplicate rows into one slot and leaves
    the later duplicates empty.

    Args:
        sentences: Iterable of token lists. Defaults to the notebook-level
            ``list_of_splitted_words``.
        stop_words: Iterable of words to drop. Defaults to NLTK's English
            stop-word list.

    Returns:
        A list of token lists, one per input row.
    """
    if sentences is None:
        sentences = list_of_splitted_words
    if stop_words is None:
        stop_words = stopwords.words('english')

    # A set gives constant-time membership checks inside the inner loop.
    english_stops = set(stop_words)
    return [
        [word for word in sentence if word not in english_stops]
        for sentence in sentences
    ]

In [14]:
# Stop-word-free token lists, one per row; reprlib keeps the preview short.
nest_list_without_stopwords = delete_stopwords()     
print(reprlib.repr(nest_list_without_stopwords))

[['likes', 'sound', 'thunder', '.'], ['sleepy', "'s", 'even', 'funny', "'s", 'ca', ...], ['sore', 'wants', 'knot', 'muscles', 'base', 'neck', ...], ['likes', 'day', 'sounds', 'new', 'song', '.'], ['home', '.', '<', '3'], ['www.thejokerblogs.com'], ...]


### SnowballStemmer 

##### https://stackoverflow.com/questions/10554052/what-are-the-major-differences-and-benefits-of-porter-and-lancaster-stemming-alg

> Stemming is a technique to remove affixes from a word, ending up with the stem. For
example, the stem of cooking is cook, and a good stemming algorithm knows that the ing
suffix can be removed.


### Lemmatization 
Lemmatization is very similar to stemming, but is more akin to synonym replacement. A lemma is a root word, as opposed to the root stem. So, unlike stemming, you are always left with a valid word that means the same thing. However, the word you end up with can be completely different.

### Compare different techniques

In [15]:
# Huge nested lists
# http://www.nltk.org/api/nltk.stem.html#module-nltk.stem.wordnet

ss = n.stem.SnowballStemmer("english")
pp = n.stem.PorterStemmer()
ls = n.stem.LancasterStemmer()
lm = n.stem.WordNetLemmatizer()

# Build every output row directly from its own input row so that all four
# results stay aligned with nest_list_without_stopwords. Locating the row with
# list.index() would return the first *equal* row, which piles the tokens of
# duplicate rows into one slot and leaves the later duplicates empty.
nest_list_without_stopwords_ss = [
    [ss.stem(word) for word in sentence] for sentence in nest_list_without_stopwords
]
nest_list_without_stopwords_pp = [
    [pp.stem(word) for word in sentence] for sentence in nest_list_without_stopwords
]
# Every token is lemmatized as a verb (pos="v").
nest_list_without_stopwords_lm = [
    [lm.lemmatize(word, pos="v") for word in sentence] for sentence in nest_list_without_stopwords
]
nest_list_without_stopwords_ls = [
    [ls.stem(word) for word in sentence] for sentence in nest_list_without_stopwords
]

print("Original \n", reprlib.repr(nest_list_without_stopwords[:6]), "\n")
print("SnowballStemmer \n", reprlib.repr(nest_list_without_stopwords_ss), "\n")
print("Porter \n", reprlib.repr(nest_list_without_stopwords_pp), "\n")
print("Lemmatizer \n", reprlib.repr(nest_list_without_stopwords_lm), "\n")
print("LancasterStemmer \n", reprlib.repr(nest_list_without_stopwords_ls), "\n")

# var1, var2, var3, var4 = [], [], [], []
# for word in nest_list_without_stopwords[9]: 
#     var1.append(ss.stem(word))
#     var2.append(pp.stem(word))
#     var3.append(lm.lemmatize(word, pos="v")) # or n -> makes difference!
#     var4.append(ls.stem(word))
# print(var1, "\n", var2, "\n", var3, "\n", var4)

Original 
 [['likes', 'sound', 'thunder', '.'], ['sleepy', "'s", 'even', 'funny', "'s", 'ca', ...], ['sore', 'wants', 'knot', 'muscles', 'base', 'neck', ...], ['likes', 'day', 'sounds', 'new', 'song', '.'], ['home', '.', '<', '3'], ['www.thejokerblogs.com']] 

SnowballStemmer 
 [['like', 'sound', 'thunder', '.'], ['sleepi', "'s", 'even', 'funni', "'s", 'ca', ...], ['sore', 'want', 'knot', 'muscl', 'base', 'neck', ...], ['like', 'day', 'sound', 'new', 'song', '.'], ['home', '.', '<', '3'], ['www.thejokerblogs.com'], ...] 

Porter 
 [['like', 'sound', 'thunder', '.'], ['sleepi', "'s", 'even', 'funni', "'s", 'ca', ...], ['sore', 'want', 'knot', 'muscl', 'base', 'neck', ...], ['like', 'day', 'sound', 'new', 'song', '.'], ['home', '.', '<', '3'], ['www.thejokerblogs.com'], ...] 

Lemmatizer 
 [['like', 'sound', 'thunder', '.'], ['sleepy', "'s", 'even', 'funny', "'s", 'ca', ...], ['sore', 'want', 'knot', 'muscle', 'base', 'neck', ...], ['like', 'day', 'sound', 'new', 'song', '.'], ['home', '

### Removing repeating characters

In [None]:
# book 38
class RepeatReplacer(object):
    """Collapse doubled characters in a word until it is recognised.

    Each pass removes one character from a pair of identical adjacent ``\\w``
    characters. Passes repeat until the word is recognised by the vocabulary
    check or until no doubled character is left.

    Because ``\\w`` also matches digits and underscores, unrecognised tokens
    such as numbers with doubled digits are shortened as well.

    Args:
        is_known_word: Optional callable ``word -> truthy/falsy``. A truthy
            result stops the collapsing and the word is returned as it is.
            Defaults to ``wordnet.synsets``, looked up when ``replace`` is
            called.
    """

    def __init__(self, is_known_word=None):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
        self.is_known_word = is_known_word

    def replace(self, word):
        """Return ``word`` with repeated characters collapsed.

        Written as a loop rather than recursion: one character is dropped per
        pass, so a long run of repeated characters would otherwise need one
        stack frame per character and could exceed the recursion limit.
        """
        is_known = self.is_known_word
        if is_known is None:
            is_known = wordnet.synsets

        while not is_known(word):
            collapsed = self.repeat_regexp.sub(self.repl, word)
            if collapsed == word:
                # Nothing left to collapse.
                break
            word = collapsed
        return word
        
# Sample lemmatized row, shown before repeated characters are collapsed.
print(nest_list_without_stopwords_lm[61])
# Notebook-level instance that delete_repChars() uses.
replacer = RepeatReplacer()

['complete', 'love', 'blackberry', 'mom', 'surprise', '.', 'yes', '.', 'excite', ',', 'cap', 'completely', 'necessary', '.', '<', '3', '<', '3', '<', '3', '<', '3', '<', '3', '<', '3']


In [None]:
def delete_repChars(sentences=None, word_replacer=None):
    """Return a copy of the tokenized rows with repeated characters collapsed.

    The result has exactly one list per input row, in the same order. Each
    output row is built directly from its own input row. Looking the row up
    again with ``list.index()`` would find the first *equal* row, which merges
    duplicate rows into one slot and leaves the later duplicates empty.

    Args:
        sentences: Iterable of token lists. Defaults to the notebook-level
            ``nest_list_without_stopwords_lm``.
        word_replacer: Object with a ``replace(word)`` method. Defaults to the
            notebook-level ``replacer``.

    Returns:
        A list of token lists, one per input row.
    """
    if sentences is None:
        sentences = nest_list_without_stopwords_lm
    if word_replacer is None:
        word_replacer = replacer

    return [
        [word_replacer.replace(word) for word in sentence]
        for sentence in sentences
    ]

# Abbreviated preview of the cleaned rows.
# NOTE: this runs the full cleanup pass only to print it; the following cell
# calls delete_repChars() again to keep the result.
print(reprlib.repr(delete_repChars()))

In [None]:
# Lemmatized rows with repeated characters collapsed, one token list per row.
nest_list_without_stopwords_lm_repchars = delete_repChars()

### Create tagged words from all sentences

In [None]:
#print(reprlib.repr(nest_list_without_stopwords_lm))

# First row in its three forms: raw text, word tokens, sentence list.
print(list_of_rows[0])
print(list_of_splitted_words[0])
print(list_of_sentences[0], "\n")
# NOTE: bare expression in the middle of the cell - its value is neither
# printed nor stored.
len(list_of_splitted_words)

# Part-of-speech tags for the tokens of the first row only.
tag_words = n.pos_tag(list_of_splitted_words[0])
print(tag_words, "\n")
# print(reprlib.repr(tagged_words()))


In [None]:
# One empty list per cleaned row; the code that would fill them is disabled below.
nest_list_tagged_words = [[] for _ in range(len(nest_list_without_stopwords_lm_repchars))]

# NOTE(review): disabled draft. It hands n.pos_tag one word at a time (the preview
# cell passes a whole token list instead) and locates rows with list.index(),
# which returns the first equal row - revisit both points before enabling.
#def tagged_words():
#    for sentence in nest_list_without_stopwords_lm_repchars:
#        for words in sentence:
#            nest_list_tagged_words[nest_list_without_stopwords_lm_repchars.index(sentence)].append(n.pos_tag(words))
#    return nest_list_tagged_words


### Unigram features

Use -a for code analysis

In [None]:
def word_fea(words):
    """Build a bag-of-words feature dict that maps every word to True.

    Args:
        words: Iterable of tokens.

    Returns:
        dict of ``{word: True}`` with one key per distinct token; an empty dict
        for empty input.
    """
    # Build the mapping from all words. Returning from inside a loop would stop
    # at the first word, and dict((word, True)) is not a valid sequence of
    # key/value pairs.
    return {word: True for word in words}

In [None]:
# Try the unigram features on one lemmatized row (row 9547).
word_fea(nest_list_without_stopwords_lm[9547])

In [None]:
# Bigram collocation
# https://github.com/neotenic/cancer/blob/master/nltk.ipynb
def bigram_features(words, score_fn=BAM.chi_sq):
    """Build a feature dict from the words and their best-scoring bigrams.

    Args:
        words: Sequence of tokens.
        score_fn: Association measure used to rank the bigrams found in
            ``words``; chi-square by default.

    Returns:
        dict that maps every word and every selected bigram tuple to True.
    """
    finder = BigramCollocationFinder.from_words(words)
    # Ask for up to 100000 bigrams, ranked by score_fn.
    top_bigrams = finder.nbest(score_fn, 100000)
    return dict.fromkeys(chain(words, top_bigrams), True)


# Try the bigram features on the tokens of the second row.
bigram_features(list_of_splitted_words[1], score_fn=BAM.chi_sq)

In [None]:
# Split data: 80% of the rows for training, the rest for testing.
# NOTE(review): no random_state is passed, so the split presumably differs from
# run to run - confirm and pin a seed if reproducibility matters.
# NOTE(review): newer scikit-learn releases provide train_test_split in
# sklearn.model_selection - confirm against the installed version.
train, test = sk.cross_validation.train_test_split(data, train_size = 0.8)

# Not used by any active code in this cell.
token_dict = {}
# tfidf = TfidfVectorizer(input=nest_list_without_stopwords_lm_repchars[0])
# tfs = tfidf.fit_transform(token_dict.values())

# http://billchambers.me/tutorials/2015/01/14/python-nlp-cheatsheet-nltk-scikit-learn.html
# http://glowingpython.blogspot.de/2013/07/combining-scikit-learn-and-ntlk.html
# http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html
# https://stackoverflow.com/questions/10526579/use-scikit-learn-to-classify-into-multiple-categories
# https://github.com/anuraagvak/IRE-PersonalityRecognition-Final/blob/master/ire_report.pdf
# https://github.com/Charudatt89/Personality_Recognition/blob/master/22-9-PersonalityRecognition/Report/Report.pdf

# NOTE(review): `train` is a slice of the raw DataFrame. SklearnClassifier.train
# presumably expects labeled feature sets, i.e. (feature dict, label) pairs -
# confirm and build them (e.g. with word_fea) before training.
classif = SklearnClassifier(LinearSVC())
classif.train(train)

# Frequency distribution over whole row strings: list_of_rows holds one string
# per status, so this counts repeated statuses, not individual words.
fdist1 = n.FreqDist(list_of_rows)
#fdist1.most_common(5)
#fdist1.hapaxes()
#set(w.lower() for w in list_of_rows)