In [6]:
import pandas as pd
import numpy as np
import re, reprlib, sys

import nltk as n
import nltk, nltk.classify.util, nltk.metrics, nltk.tokenize, nltk.stem
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.classify import MaxentClassifier
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures as BAM
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.classify.scikitlearn import SklearnClassifier

from itertools import chain

import sklearn as sk
from sklearn import cross_validation
from sklearn.feature_extraction.text import *
from sklearn.svm import *
from sklearn.pipeline import *
from sklearn.multiclass import *
from sklearn.naive_bayes import *

# n.download()

### Read data and show them

In [12]:
data = pd.read_csv("data.csv", parse_dates=True, infer_datetime_format=True, 
            sep = None, encoding = "latin-1", engine = "python")
data.head(n=2)

FileNotFoundError: [Errno 2] No such file or directory: '/home/jm/home/jm/Documents/caseSolvingSeminar/R_scripts_and_notebooks/raw_data/data.csv'

In [3]:
# data.describe()

In [11]:
# plit data
train, test = sk.cross_validation.train_test_split(data, train_size = 0.66)
print(len(test))
print(len(train))

# http://billchambers.me/tutorials/2015/01/14/python-nlp-cheatsheet-nltk-scikit-learn.html
# http://glowingpython.blogspot.de/2013/07/combining-scikit-learn-and-ntlk.html
# http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html
# https://stackoverflow.com/questions/10526579/use-scikit-learn-to-classify-into-multiple-categories
# https://github.com/anuraagvak/IRE-PersonalityRecognition-Final/blob/master/ire_report.pdf
# https://github.com/Charudatt89/Personality_Recognition/blob/master/22-9-PersonalityRecognition/Report/Report.pdf

#classif = SklearnClassifier(LinearSVC())
#classif.train(train)

#fdist1 = n.FreqDist(list_of_rows_CD)
#fdist1.most_common(5)
#fdist1.hapaxes()
#set(w.lower() for w in list_of_rows)

3372
6545


### Combine all the strings for each author


In [12]:
combined_dataset = train.groupby(['#AUTHID'])['STATUS'].apply(lambda x: ' '.join(x)).reset_index()
combined_dataset.to_csv(path_or_buf="train.csv")
combined_dataset.head(n=3)

Unnamed: 0,#AUTHID,STATUS
0,00419a4c96b32cd63b2c7196da761274,back in cali!!!
1,02c37028a782cfda660c7243e45244bb,"Tentative Examination Schedule, Semester 1, 20..."
2,03133a828cd0cf52e3752813ce5d818f,"Did *PROPNAME* in 16:37, made money in Vegas, ..."


### Our main column consists of the rows of sentences. One after another.
#### Make all words lower case

In [13]:
lines_of_combined_dataset = combined_dataset["STATUS"].str.lower()
lines_of_trained_dataset = train["STATUS"].str.lower()

# print(lines_of_combined_dataset.head(n=5))

### Count how many unique values are there in the first column ?
#### Basically number of users

In [15]:
unique_values = train[train.columns[0]]
print(pd.Series.nunique(unique_values))

pd.Series.nunique(test[test.columns[0]])

242


231

In [16]:
# https://stackoverflow.com/questions/21633580/pandas-counting-unique-values-in-a-dataframe
data_unique_values = pd.value_counts(unique_values.values.ravel())

### Now, let's split words for each sentence and make real sentences (python-usable objects) from rows.

<strong>Rules</strong>
    * Everything counts (but no weird symbols incl. punc.)
    * No use of lemmalization, deleting words etc.

In [20]:
# http://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize
twt = n.TreebankWordTokenizer()

# Create lists
list_of_rows_CD = [l.split("\n")[0] for l in lines_of_trained_dataset]
#print(list_of_rows_CD)
list_of_splitted_words_CD, list_of_sentences = [], []

for k in list_of_rows_CD:
    list_of_sentences.append(n.sent_tokenize(k))
    
for kCD in list_of_rows_CD:
    list_of_splitted_words_CD.append(n.word_tokenize(kCD))

In [18]:
print(list_of_splitted_words_CD[:2], "\n\n", list_of_rows_CD[:2], "\n\n", list_of_sentences[:2])

[['is', 'still', 'rather', 'fluish', '.'], ['is', 'watching', 'a', 'movie', ':', ')']] 

 ['is still rather fluish.', 'is watching a movie:)'] 

 [['is still rather fluish.'], ['is watching a movie:)']]


In [11]:
# https://stackoverflow.com/questions/2058985/python-count-sub-lists-in-nested-list
print("there are", sum(len(x) for x in list_of_splitted_words_CD), "tokens in", len(list_of_sentences), "sentences")

there are 117375 tokens in 6545 sentences


In [12]:
#print(list_of_rows_CD[:2], "\n")
#print(list_of_sentences[:2],"\n") 
#print(list_of_splitted_words_CD[:2], "\n")

### Different tokenizers (will decide later on which one)

Source: http://text-processing.com/demo/tokenize/


<!-- <img src="dif_tokenizers.png"> -->

### Goal: Delete stopwords from each row

#### And store them again in each row (= nested list)

In [13]:
# stopwords.words("english") 
# https://stackoverflow.com/questions/19249201/how-to-create-a-number-of-empty-nested-lists-in-python
# english_stops_list = list(set(stopwords.words('english')))

In [14]:
def delete_stopwords():
    english_stops = set(stopwords.words('english'))
    nest_list_without_stopwords = [[] for _ in range(len(list_of_splitted_words_CD))]
    for sentence in list_of_splitted_words_CD: 
        for word in sentence:
            if word not in english_stops:
                nest_list_without_stopwords[list_of_splitted_words_CD.index(sentence)].append(word)
    return nest_list_without_stopwords

In [15]:
nest_list_without_stopwords = delete_stopwords()

print(reprlib.repr(list_of_splitted_words_CD), "\n")
print(reprlib.repr(nest_list_without_stopwords))

[['is', 'glad', 'that', 'we', 'waited', 'to', ...], ['is', 'stfo', '.'], ['ice', 'skating', 'in', 'houston', '=', 'melted', ...], ['ran', 'back', 'for', 'her', 'laptop', 'when', ...], ['done', 'na', 'jud', 'sa', 'ppt', '.', ...], ['you', 'are', 'beautiful', ',', 'sir', '.', ...], ...] 

[['glad', 'waited', 'watch', 'wolverine', 'rental', '.'], ['stfo', '.'], ['ice', 'skating', 'houston', '=', 'melted', 'ice', ...], ['ran', 'back', 'laptop', 'firebell', 'went', 'etcheverry', ...], ['done', 'na', 'jud', 'sa', 'ppt', '.', ...], ['beautiful', ',', 'sir', '.', 'hope', "'re", ...], ...]


### SnowballStemmer 

##### https://stackoverflow.com/questions/10554052/what-are-the-major-differences-and-benefits-of-porter-and-lancaster-stemming-alg

> Stemming is a technique to remove affixes from a word, ending up with the stem. For
example, the stem of cooking is cook, and a good stemming algorithm knows that the ing
suffix can be removed.


### Lemmatization 
is very similar to stemming, but is more akin to synonym replacement. A lemma is a root word, as opposed to the root stem. So unlike stemming, you are always left with a valid word that means the same thing. However, the word you end up with can be completely different.

### Compare different techniques

In [16]:
# Huge nested lists
# http://www.nltk.org/api/nltk.stem.html#module-nltk.stem.wordnet

nest_list_without_stopwords_ss = [[] for _ in range(len(nest_list_without_stopwords))]
nest_list_without_stopwords_pp = [[] for _ in range(len(nest_list_without_stopwords))]
nest_list_without_stopwords_ls = [[] for _ in range(len(nest_list_without_stopwords))]
nest_list_without_stopwords_lm = [[] for _ in range(len(nest_list_without_stopwords))]

ss = n.stem.SnowballStemmer("english")
pp = n.stem.PorterStemmer()
ls = n.stem.LancasterStemmer()
lm = n.stem.WordNetLemmatizer()

for sentence in nest_list_without_stopwords: 
    for word in sentence:
        nest_list_without_stopwords_ss[nest_list_without_stopwords.index(sentence)].append(ss.stem(word))
        nest_list_without_stopwords_pp[nest_list_without_stopwords.index(sentence)].append(pp.stem(word))
        nest_list_without_stopwords_lm[nest_list_without_stopwords.index(sentence)].append(lm.lemmatize(word, pos="v"))
        nest_list_without_stopwords_ls[nest_list_without_stopwords.index(sentence)].append(ls.stem(word))

print("Original \n", reprlib.repr(nest_list_without_stopwords[:6]), "\n")
print("SnowballStemmer \n", reprlib.repr(nest_list_without_stopwords_ss), "\n")
print("Porter \n", reprlib.repr(nest_list_without_stopwords_pp), "\n")
print("Lemmatizer \n", reprlib.repr(nest_list_without_stopwords_lm), "\n")
print("LancasterStemmer \n", reprlib.repr(nest_list_without_stopwords_ls), "\n")

Original 
 [['glad', 'waited', 'watch', 'wolverine', 'rental', '.'], ['stfo', '.'], ['ice', 'skating', 'houston', '=', 'melted', 'ice', ...], ['ran', 'back', 'laptop', 'firebell', 'went', 'etcheverry', ...], ['done', 'na', 'jud', 'sa', 'ppt', '.', ...], ['beautiful', ',', 'sir', '.', 'hope', "'re", ...]] 

SnowballStemmer 
 [['glad', 'wait', 'watch', 'wolverin', 'rental', '.'], ['stfo', '.'], ['ice', 'skate', 'houston', '=', 'melt', 'ice', ...], ['ran', 'back', 'laptop', 'firebel', 'went', 'etcheverri', ...], ['done', 'na', 'jud', 'sa', 'ppt', '.', ...], ['beauti', ',', 'sir', '.', 'hope', 're', ...], ...] 

Porter 
 [['glad', 'wait', 'watch', 'wolverin', 'rental', '.'], ['stfo', '.'], ['ice', 'skate', 'houston', '=', 'melt', 'ice', ...], ['ran', 'back', 'laptop', 'firebel', 'went', 'etcheverri', ...], ['done', 'na', 'jud', 'sa', 'ppt', '.', ...], ['beauti', ',', 'sir', '.', 'hope', "'re", ...], ...] 

Lemmatizer 
 [['glad', 'wait', 'watch', 'wolverine', 'rental', '.'], ['stfo', '.'], 

### Removing repeating characters

In [17]:
# book 38
class RepeatReplacer(object):
    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
        
    def replace(self, word):        
        if wordnet.synsets(word):
            return word
    
        repl_word = self.repeat_regexp.sub(self.repl, word)

        if repl_word != word:
            return self.replace(repl_word)
        else:
            return repl_word
        
    def delete_stupid_chars(self, word):
        """
        http://stackoverflow.com/a/3874768
        used above
        """
        replaced_word = self.replace(word)
        rem = "!?#.,();:'[].,//``...~<>$%^&*-_-=+"
        return replaced_word.translate(str.maketrans(dict.fromkeys(rem)))

replacer = RepeatReplacer()
replacer.delete_stupid_chars("!?sdf,word!??)()")

'sdfword'

In [18]:
def delete_repChars():
    nest_list_without_stopwords_lm_repchars = [[] for _ in range(len(nest_list_without_stopwords_lm))]
    for sentence in nest_list_without_stopwords_lm:
        for word in sentence:
            nest_list_without_stopwords_lm_repchars[nest_list_without_stopwords_lm.index(sentence)].append(replacer.delete_stupid_chars(word))
    return nest_list_without_stopwords_lm_repchars

In [19]:
nest_list_without_stopwords_lm_repchars = delete_repChars()
print(reprlib.repr(nest_list_without_stopwords_lm_repchars))

[['glad', 'wait', 'watch', 'wolverine', 'rental', ''], ['stfo', ''], ['ice', 'skate', 'houston', '', 'melt', 'ice', ...], ['run', 'back', 'laptop', 'firebel', 'go', 'etchevery', ...], ['do', 'na', 'jud', 'sa', 'pt', '', ...], ['beautiful', '', 'sir', '', 'hope', 're', ...], ...]


In [20]:
#[nest_list_without_stopwords_lm_repchars for nest_list_without_stopwords_lm_repchars in nest_list_without_stopwords_lm_repchars if nest_list_without_stopwords_lm_repchars]
#    nest_list_without_stopwords_lm_repchars_as = [[] for _ in range(len(nest_list_without_stopwords_lm_repchars))]
#    for sentence in nest_list_without_stopwords_lm_repchars:
#        nest_list_without_stopwords_lm_repchars_as[nest_list_without_stopwords_lm_repchars.index(sentence)].append(list(filter(None, sentence)))   
#https://stackoverflow.com/questions/973568/convert-nested-lists-to-string

In [21]:
def delete_empty_strings():
    nest = [[] for _ in range(len(nest_list_without_stopwords_lm_repchars))]
    for sentence in nest_list_without_stopwords_lm_repchars: 
        for word in sentence:
            if word != '':
                nest[nest_list_without_stopwords_lm_repchars.index(sentence)].append(word)
    return nest

In [22]:
nest = delete_empty_strings()
outlst = [' '.join([str(c) for c in hm]) for hm in nest]

#print(outlst)
# print(len(nest_list_without_stopwords_lm_repchars[61]), "\n", len(nest_list_without_stopwords_lm[61]))

In [23]:
#http://aylien.com/web-summit-2015-tweets-part1
vectorizer = TfidfVectorizer(min_df=4, max_features = 10000)
vz = vectorizer.fit_transform(outlst)
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print("devil: " + str(tfidf["devil"]))

devil: 7.30170280527


### Create tagged words from all sentences

In [24]:
#print(reprlib.repr(nest_list_without_stopwords_lm))

print(list_of_rows_CD[0])
print(list_of_splitted_words_CD[0])
print(list_of_sentences[0], "\n")

tag_words = n.pos_tag(list_of_splitted_words_CD)
print(tag_words, "\n")
# print(reprlib.repr(tagged_words()))


is glad that we waited to watch wolverine as a rental.
['is', 'glad', 'that', 'we', 'waited', 'to', 'watch', 'wolverine', 'as', 'a', 'rental', '.']
['is glad that we waited to watch wolverine as a rental.'] 



AttributeError: 'list' object has no attribute 'isdigit'

In [25]:
nest_list_tagged_words = [[] for _ in range(len(nest_list_without_stopwords_lm_repchars))]

#def tagged_words():
#    for sentence in nest_list_without_stopwords_lm_repchars:
#        for words in sentence:
#            nest_list_tagged_words[nest_list_without_stopwords_lm_repchars.index(sentence)].append(n.pos_tag(words))
#    return nest_list_tagged_words


### Unigram features

Use -a for code analysis

In [26]:
def word_fea(words):
    return dict((word, True))
word_fea(outlst)

ValueError: dictionary update sequence element #0 has length 1; 2 is required

In [27]:
# Bigram collocation
# https://github.com/neotenic/cancer/blob/master/nltk.ipynb
def bigram_features(words, score_fn=BAM.chi_sq): 
    bg_finder = BigramCollocationFinder.from_words(words) 
    bigrams = bg_finder.nbest(score_fn, 100000) 
    return dict((bg, True) for bg in chain(words, bigrams)) 

#bigram_features(outlst, score_fn=BAM.chi_sq)

In [28]:
classifier = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC()))])

In [29]:
cl = n.NaiveBayesClassifier.train(train)
print(n.classify.accuracy(cl, test),"%.3f")
cl.show_most_informative_features(40)
cl.prob_classify(featurize(name)) #

ValueError: too many values to unpack (expected 2)