In [91]:
import pandas as pd

In [92]:
l_raw = pd.read_excel("data/LData.xlsx", index_col='Unnamed: 0')
l_raw.head(1)

Unnamed: 0,app_name,user_id,user_name,date,country,version,score,topic,review,url,review_id,category_final,sentiment_final,req_final
52505.0,Facebook,246193109,Help is herr,2017-06-21 00:00:00,United States,97,1,Notifications not showing up,The notification badges are showing up on my i...,https://itunes.apple.com/WebObjects/MZStore.wo...,53163.0,requirement,neutral,functional


In [93]:
l_raw.info()
la = l_raw

<class 'pandas.core.frame.DataFrame'>
Float64Index: 3000 entries, 52505.0 to 45413.0
Data columns (total 14 columns):
app_name           3000 non-null object
user_id            3000 non-null object
user_name          2993 non-null object
date               3000 non-null object
country            3000 non-null object
version            3000 non-null object
score              3000 non-null int64
topic              2994 non-null object
review             2996 non-null object
url                2850 non-null object
review_id          2850 non-null float64
category_final     3000 non-null object
sentiment_final    2796 non-null object
req_final          1081 non-null object
dtypes: float64(1), int64(1), object(12)
memory usage: 351.6+ KB


# DATA CLEANING

* Reindex

In [94]:
la = la.reset_index(drop=True)
la.head()

Unnamed: 0,app_name,user_id,user_name,date,country,version,score,topic,review,url,review_id,category_final,sentiment_final,req_final
0,Facebook,246193109,Help is herr,2017-06-21 00:00:00,United States,97,1,Notifications not showing up,The notification badges are showing up on my i...,https://itunes.apple.com/WebObjects/MZStore.wo...,53163.0,requirement,neutral,functional
1,Facebook,43034279,javamdnss,2017-06-16 00:00:00,United States,97,1,Hate it!,Why do they make changes we don't need? Now th...,https://itunes.apple.com/WebObjects/MZStore.wo...,53905.0,other,very negative,
2,Facebook,496978255,,2017-05-27 00:00:00,Hong Kong,94,1,Useless function n poor experience,Story is useless n annoying to user. \nCan't s...,https://itunes.apple.com/WebObjects/MZStore.wo...,47401.0,other,negative,
3,Facebook,139595037,Gilbertiggy,2017-05-26 00:00:00,United Kingdom,94,1,To many updates!,This app is always having an update for someth...,https://itunes.apple.com/WebObjects/MZStore.wo...,42233.0,requirement,negative,functional
4,Facebook,180832062,Princess Lou 24,2017-06-01 00:00:00,United Kingdom,94,1,Photo albums,Just spent an hour trying to upload photos and...,https://itunes.apple.com/WebObjects/MZStore.wo...,42066.0,requirement,negative,non-functional


* Drop unneccessary columns

In [95]:
la = la.drop(['user_id', 'url', 'review_id', 'version', 'user_name', 'app_name', 'date', 'req_final'], axis=1)
la.head(1)

Unnamed: 0,country,score,topic,review,category_final,sentiment_final
0,United States,1,Notifications not showing up,The notification badges are showing up on my i...,requirement,neutral


* Rename label columns

In [96]:
la = la.rename(columns = {
    "category_final": "category",
    "sentiment_final": "sentiment"
}, errors="raise")
la.head(1)

Unnamed: 0,country,score,topic,review,category,sentiment
0,United States,1,Notifications not showing up,The notification badges are showing up on my i...,requirement,neutral


* Remove NaN rows

In [97]:
# l = l[l['review'].notna()] # take the rows that are not NaN
la = la.dropna() # remove any row with at least one value missing NaN
la.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2795 entries, 0 to 2999
Data columns (total 6 columns):
country      2795 non-null object
score        2795 non-null int64
topic        2795 non-null object
review       2795 non-null object
category     2795 non-null object
sentiment    2795 non-null object
dtypes: int64(1), object(5)
memory usage: 152.9+ KB


In [98]:
# https://docs.python.org/2/library/string.html
import string
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

In [99]:
def remove_punctuation(data, column):
    """
    Uses string.punctuation list to remove unwarranted characeters
    !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    stripped = [review.translate(punctuation_table) for review in data[column]]
    return stripped

def lowercase(data, column):
    review_arr = data[column].to_list()
    return [review.lower() for review in review_arr]

def asciionly(data, column):
    new_reviews = []
    for review in data[column]:
        review = re.sub('[^A-Za-z0-9\s]', '', review)
        new_reviews.append(review)
    return new_reviews

def new_line_to_space(data, column):
    new_reviews = []
    for review in data[column]:
        nr = review.replace('\n', ' ')
        nr = nr.replace('\t', '') # remove tab as well
        new_reviews.append(nr)
    return new_reviews

def remove_single_characters(data, column):
    new_reviews = []
    for review in data[column]:
        words = review.split(" ")
        new_words = []
        for index, word in enumerate(words):
            if (len(word) > 1):
                new_words.append(word)
        new_review = " ".join(word for word in new_words)
        new_reviews.append(new_review)
    return new_reviews

def remove_non_english(data, column):
    words = set(nltk.corpus.words.words())
    new_reviews = []
    for review in data[column]:
        new_review = " ".join(w for w in nltk.wordpunct_tokenize(review) if w.lower() in words or not w.isalpha())
        new_reviews.append(new_review)
    return new_reviews

def remove_stopwords(data, column):
    stop_words = stopwords.words('english')
    new_reviews = []
    for review in data[column]:
        new_words = []
        for word in review.split(" "):
            if word not in stop_words:
                new_words.append(word)
        new_review = " ".join(w for w in new_words)
        new_reviews.append(new_review)  
    return new_reviews

def tokenize_words(data, column):
    tokenized_reviews = []
    for review in data[column]:
        tokenized_reviews.append(word_tokenize(review))
    return tokenized_reviews

def stem(data, column):
    porter = PorterStemmer()
    new_reviews = []
    for review_tokens in data[column]:
        stemmed = [porter.stem(word) for word in review_tokens]
        new_reviews.append(stemmed)
    return new_reviews

def lemmatize(data, column):
    wnt = WordNetLemmatizer()
    new_reviews = []
    for review_tokens in data[column]:
        lemmas = []   
        for word in review_tokens:
            lemma = wnt.lemmatize(word, pos='v')
            lemmas.append(lemma)
        new_reviews.append(lemmas)
    return new_reviews

In [10]:
# testing
review = "The notification badges are showing up on my iphone when i don't want them to and stuff I can't believe this"
rdi = { 'review' : [review] }
te = pd.DataFrame(rdi)

In [11]:
te['review'] = tokenize_words(te, 'review')
te['review'][0]

['The',
 'notification',
 'badges',
 'are',
 'showing',
 'up',
 'on',
 'my',
 'iphone',
 'when',
 'i',
 'do',
 "n't",
 'want',
 'them',
 'to',
 'and',
 'stuff',
 'I',
 'ca',
 "n't",
 'believe',
 'this']

In [12]:
type(te['review'][0])

list

In [13]:
te = lemmatize(te, 'review')
te

[['The',
  'notification',
  'badge',
  'be',
  'show',
  'up',
  'on',
  'my',
  'iphone',
  'when',
  'i',
  'do',
  "n't",
  'want',
  'them',
  'to',
  'and',
  'stuff',
  'I',
  'ca',
  "n't",
  'believe',
  'this']]

In [14]:
la.head(1)

Unnamed: 0,country,score,topic,review,category,sentiment
0,United States,1,Notifications not showing up,The notification badges are showing up on my i...,requirement,neutral


In [15]:
norm_l = la.copy()
norm_l.head()

Unnamed: 0,country,score,topic,review,category,sentiment
0,United States,1,Notifications not showing up,The notification badges are showing up on my i...,requirement,neutral
1,United States,1,Hate it!,Why do they make changes we don't need? Now th...,other,very negative
2,Hong Kong,1,Useless function n poor experience,Story is useless n annoying to user. \nCan't s...,other,negative
3,United Kingdom,1,To many updates!,This app is always having an update for someth...,requirement,negative
4,United Kingdom,1,Photo albums,Just spent an hour trying to upload photos and...,requirement,negative


In [16]:
def normalize(data, column):
    data[column] = lowercase(data, column)
#     data[column] = remove_non_english(data, column)
    data[column] = remove_punctuation(data, column)
    data[column] = asciionly(data, column)
    data[column] = new_line_to_space(data, column)
    data[column] = remove_single_characters(data, column)
    data[column] = remove_stopwords(data, column)
    data[column] = tokenize_words(data, column)
#     data[column] = stem(data, column)
    data[column] = lemmatize(data, column)
    return data[column]

In [17]:
norm_l['review'] = normalize(norm_l, 'review')
norm_l.head(1)

Unnamed: 0,country,score,topic,review,category,sentiment
0,United States,1,Notifications not showing up,"[notification, badge, show, iphone, plus, open...",requirement,neutral


In [18]:
r = norm_l['review'][321]
r

['cant', 'remove', 'reactions', 'interface', 'unusable', 'stupid']

# CLASSIFIER TRAINING

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import model_selection, naive_bayes, svm
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn import metrics

from sklearn.model_selection import cross_val_score

In [20]:
def collapse_labels(labels):
    new_labels = []
    pos = ['positive', 'very positive', 'positve', 'postive']
    neg = ['negative', 'very negative']
    for label in labels:
        label = label.lower()
        if label in pos:
            label = "positive"
        elif label in neg:
            label = "negative"
        new_labels.append(label)
    return pd.Series(new_labels)

def stringify_reviews(reviews):
    new_reviews = []
    for review in reviews:
        new_reviews.append(str(review))
    return new_reviews

## SENTIMENT

In [21]:
labels = collapse_labels(norm_l['sentiment'])
reviews = stringify_reviews(norm_l['review'])

In [22]:
from sklearn.pipeline import Pipeline
import numpy as np

tfidf_vectorizer = TfidfVectorizer(max_features=5000)
count_vectorizer = CountVectorizer(min_df=25, ngram_range=(1,2))

def train_clfs(reviews, labels, mf=5000):
    """
    Encode target labels with value between 0 and n_classes-1.
    """
    encoder = LabelEncoder()
    encoded_labels = encoder.fit_transform(labels)
    
    """
    Split training and test data (70/30)
    """
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        reviews,
        encoded_labels,
        random_state=8,
        train_size=0.7
    )
    
    """
    TF-IDF
    """
    tfidf_vectorizer.fit(reviews) # on the entire vocabulary
    
    X_train_tfidf = tfidf_vectorizer.transform(X_train)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)
    
    """
    Models
    """
    nb = naive_bayes.MultinomialNB().fit(X_train_tfidf, y_train)
    nb_y_preds = nb.predict(X_test_tfidf)
    print_performance("multinomial naive bayes",
                     y_test, nb_y_preds)
    
    SVM = svm.SVC(C=1.0, kernel='linear', gamma='auto').fit(X_train_tfidf, y_train)
    SVM_y_preds = SVM.predict(X_test_tfidf)
    print_performance("svm", y_test, SVM_y_preds)
        
    count_vectorizer.fit(X_train)
    X_train_vect = count_vectorizer.transform(X_train)
    X_test_vect = count_vectorizer.transform(X_test)
    
    lr = LogisticRegression(solver='liblinear', multi_class='auto').fit(X_train_vect, y_train)
    lr_y_preds = lr.predict(X_test_vect)
    print_performance("logistic regression", y_test, lr_y_preds)
    
    rf = RandomForestClassifier(max_depth=50).fit(X_train_tfidf, y_train)
    X_vect_test_tfidf = tfidf_vectorizer.transform(X_test)
    rf_y_preds = rf.predict(X_vect_test_tfidf)
    print_performance("random forest", y_test, rf_y_preds)
    
    sgd = SGDClassifier(loss='hinge', penalty='l2',
                             alpha=1e-3, random_state=42,
                             max_iter=5, tol=None).fit(X_train_tfidf, y_train)
    sgd_y_preds = sgd.predict(X_vect_test_tfidf)
    print_performance("sgd", y_test, sgd_y_preds)
    
def print_performance(title, test, predicted):
    print(f'/\/\/\/\/ {title}')
    print("Recall\t\t",recall_score(test, predicted, average="weighted")*100)
    print()
    print("Precision\t", precision_score(test, predicted, average='weighted')*100)
    print()
    print("F1\t\t", f1_score(test, predicted, average='weighted')*100, "\n")
    

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                         alpha=1e-3, random_state=42,
                         max_iter=8, tol=None)),
])
    
def train_clfs2(reviews, labels, mf=5000):
    encoder = LabelEncoder()
    e_labels = encoder.fit_transform(labels)
    
    X_train, X_test, y_train, y_test = model_selection.train_test_split(reviews, e_labels,
                                                                        random_state=42, train_size=0.7)
    
    text_clf.fit(X_train, y_train)
    
    y_pred = text_clf.predict(X_test)
    print(np.mean(y_pred == y_test))
#     print(metrics.classification_report(y_test, y_pred, target_names=['positive', 'neutral', 'negative']))

In [23]:
def get_tfidf(reviews, X_train):
    tfidf_vectorizer.fit(reviews)
    X_train_tfidf = tfidf_vectorizer.transform(X_train)
    return X_train_tfidf
    
def get_count_vect(X_train, X_test):
    count_vectorizer.fit(X_train)
    X_train_vect = count_vectorizer.transform(X_train)
    return X_train_vect

In [24]:
train_clfs2(reviews, labels)

0.6567342073897497


In [25]:
# 3-class classification
labels.unique()

array(['neutral', 'negative', 'positive'], dtype=object)

In [26]:
train_clfs(reviews, labels)

/\/\/\/\/ multinomial naive bayes
Recall		 66.15017878426698

Precision	 66.70088015702542

F1		 64.49623885060626 

/\/\/\/\/ svm
Recall		 67.34207389749702

Precision	 67.40061457203662

F1		 67.36049779767703 

/\/\/\/\/ logistic regression
Recall		 61.14421930870083

Precision	 60.73714073131784

F1		 60.877959094766396 

/\/\/\/\/ random forest
Recall		 65.55423122765197

Precision	 64.96805163133234

F1		 64.45319970907914 

/\/\/\/\/ sgd
Recall		 67.46126340882003

Precision	 66.68513014297989

F1		 66.74201356098311 



In [52]:
# add extra features blindly
lf = l.copy()
lf.head(1)

Unnamed: 0,country,score,topic,review,category,sentiment
0,United States,1,Notifications not showing up,The notification badges are showing up on my i...,requirement,neutral


In [139]:
def add_features(df, list_of_features):
    enriched_reviews = []
    # go through each review in the dataframe
    
    for index, row in df.iterrows():
        features = []
        
        # go through each of the provided features
        for f in list_of_features:
            features.append(row[f])
        
        enriched_review = ""
        for feature in features:
            enriched_review += str(feature) + " "
        enriched_review += row['review']
        enriched_reviews.append(enriched_review)
    return enriched_reviews

In [53]:
lf['review'] = add_features(lf, ['country', 'topic', 'score'])
lf['review'] = normalize(lf, 'review')
lf['review'] = stringify_reviews(lf['review'])

f_reviews = lf['review']

In [54]:
train_clfs2(f_reviews, labels)

0.6734207389749702


In [55]:
train_clfs(f_reviews, labels)

/\/\/\/\/ multinomial naive bayes
Recall		 67.34207389749702

Precision	 67.09115623856893

F1		 65.60951060580464 

/\/\/\/\/ svm
Recall		 68.89153754469606

Precision	 69.14623635671052

F1		 69.00342921910872 

/\/\/\/\/ logistic regression
Recall		 63.17044100119189

Precision	 62.88423495372807

F1		 62.9456187191635 

/\/\/\/\/ random forest
Recall		 67.22288438617402

Precision	 66.50084209083018

F1		 66.59892220082924 

/\/\/\/\/ sgd
Recall		 70.20262216924911

Precision	 69.5406349130178

F1		 69.6127972478808 



# HYPERPARAM TUNING

In [56]:
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

parameters = {
    'vect__ngram_range': [(1,1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=1)

encoder = LabelEncoder()
e_labels = encoder.fit_transform(labels)

X_train, X_test, y_train, y_test = model_selection.train_test_split(f_reviews, e_labels,
                                                                    random_state=42, train_size=0.6)

gs_clf = gs_clf.fit(X_train, y_train)
gs_clf.best_score_

In [60]:
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)


# CROSS VALIDATION

In [61]:
scores = cross_val_score(text_clf, f_reviews, labels, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# CONFUSION MATRIX

In [38]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(text_clf, X_train, y_train, cv=5)
confusion_matrix(y_train, y_train_pred)

array([[407, 109,  67],
       [171, 241,  97],
       [ 65,  45, 475]])

In [82]:
la['review'][0] # original

TypeError: tuple indices must be integers or slices, not str

In [68]:
str(norm_l['review'][0]) # normalized

"['notification', 'badge', 'show', 'iphone', 'plus', 'open', 'app', 'happen', 'since', 'last', 'update', 'also', 'quit', 'interrupt', 'videos', 'stupid', 'ads', 'annoy']"

In [66]:
str(lf['review'][0]) # normalized, with additional features

"['unite', 'state', 'notifications', 'show', 'notification', 'badge', 'show', 'iphone', 'plus', 'open', 'app', 'happen', 'since', 'last', 'update', 'also', 'quit', 'interrupt', 'videos', 'stupid', 'ads', 'annoy']"

# LANGUAGE IDENTIFICATION

In [72]:
from pathlib import Path
import random
from collections import Counter, defaultdict
from sklearn.neighbors import *
from matplotlib import pyplot as plt
from mpl_toolkits import mplot3d
%matplotlib inline
 
def read(file):
    '''Returns contents of a file'''
    with open(file, 'r', errors='ignore') as f:
        text = f.read()
    return text
 
def load_eu_texts():
    '''Read texts snipplets in 10 different languages into pd.Dataframe
 
    load_eu_texts() -> pd.Dataframe
    
    The text snipplets are taken from the nltk-data corpus.
    '''
    basepath = Path('/Users/gohost/nltk_data/corpora/europarl_raw/')
    df = pd.DataFrame(columns=['text', 'lang', 'len'])
    languages = [None]
    for lang in basepath.iterdir():
        languages.append(lang.as_posix())
        t = '\n'.join([read(p) for p in lang.glob('*')])
        d = pd.DataFrame()
        d['text'] = ''
        d['text'] = pd.Series(t.split('\n'))
        d['lang'] = lang.name.title()
        df = df.append(d.copy(), ignore_index=True)
    return df
 
def clean_eutextdf(df):
    '''Preprocesses the texts by doing a set of cleaning steps
    
    clean_eutextdf(df) -> cleaned_df
    '''
    # Cuts of whitespaces a the beginning and and
    df['text'] = [i.strip() for i in df['text']]
    # Generate a lowercase Version of the text column
    df['ltext'] = [i.lower() for i in df['text']]
 
    # Determining the length of each text
    df['len'] = [len(i) for i in df['text']]
    # Drops all texts that are not at least 200 chars long
    df = df.loc[df['len'] > 200]
    return df
 
# Execute the above functions to load the texts
df = clean_eutextdf(load_eu_texts())
 
# Print a few stats of the read texts
textline = 'Number of text snippplets: ' + str(df.shape[0])
print('\n' + textline + '\n' + ''.join(['_' for i in range(len(textline))]))
c = Counter(df['lang'])
for l in c.most_common():
    print('%-25s' % l[0] + str(l[1]))
df.sample(10)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,



Number of text snippplets: 63156
________________________________
Greek                    6681
French                   6466
German                   6395
Italian                  6383
Portuguese               6147
Spanish                  6016
Finnish                  5597
Swedish                  4940
Danish                   4914
Dutch                    4826
English                  4791


Unnamed: 0,lang,len,text,ltext
178620,French,237,Je demanderai conseil aux services juridiques ...,je demanderai conseil aux services juridiques ...
149602,Danish,317,Hvis vi skal have held til at overbevise en sk...,hvis vi skal have held til at overbevise en sk...
120874,Portuguese,211,A prova disso é o facto de que nas regiões ond...,a prova disso é o facto de que nas regiões ond...
90450,Greek,247,Λύσεις σε αυτά τα προβλήματα δεν αναζητούμε εξ...,λύσεις σε αυτά τα προβλήματα δεν αναζητούμε εξ...
133398,Finnish,224,"Lisäksi täytyy olla niin , että jos tilintarka...","lisäksi täytyy olla niin , että jos tilintarka..."
27642,German,270,Wenn Europa in eine Phase des Nachdenkens über...,wenn europa in eine phase des nachdenkens über...
89838,Greek,256,"Ωστόσο , η γαλλική αντιπροσωπεία της Ομάδας μα...","ωστόσο , η γαλλική αντιπροσωπεία της ομάδας μα..."
181089,French,319,Cette clarification fait défaut dans le texte ...,cette clarification fait défaut dans le texte ...
173490,French,355,Mais dès le Sommet suivi tenu au mois de décem...,mais dès le sommet suivi tenu au mois de décem...
14899,Dutch,224,Het zou ronduit beschamend zijn als we zouden ...,het zou ronduit beschamend zijn als we zouden ...


In [73]:
def calc_charratios(df):
    '''Calculating ratio of any (alphabetical) char in any text of df for each lyric
    
    calc_charratios(df) -> list, pd.Dataframe
    '''
    CHARS = ''.join({c for c in ''.join(df['ltext']) if c.isalpha()})
    print('Counting Chars:')
    for c in CHARS:
        print(c, end=' ')
        df[c] = [r.count(c) for r in df['ltext']] / df['len']
    return list(CHARS), df
 
features, df = calc_charratios(df)

Counting Chars:
β è a m f ü d é τ ç γ κ ò v ô υ δ α e ό ύ û ϊ ë q χ j ρ ά x b ï h ì ΐ l ä ώ ú η ö ή ξ t õ î g º í θ k r z w æ â ê ι i o c ω å ß à ã ν μ π ς ª á ù έ y ί s n φ ó ϋ ñ λ u ε ΰ σ p ψ ο ζ ø 

In [74]:
def split_dataset(df, ratio=0.5):
    '''Split the dataset into a train and a test dataset
    
    split_dataset(featuredf, ratio) -> pd.Dataframe, pd.Dataframe
    '''
    df = df.sample(frac=1).reset_index(drop=True)
    traindf = df[:][:int(df.shape[0] * ratio)]
    testdf = df[:][int(df.shape[0] * ratio):]
    return traindf, testdf
 
featuredf = pd.DataFrame()
featuredf['lang'] = df['lang']
for feature in features:
    featuredf[feature] = df[feature]
traindf, testdf = split_dataset(featuredf, ratio=0.80)
 
x = np.array([np.array(row[1:]) for index, row in traindf.iterrows()])
y = np.array([l for l in traindf['lang']])
X = np.array([np.array(row[1:]) for index, row in testdf.iterrows()])
Y = np.array([l for l in testdf['lang']])

In [75]:
def train_knn(x, y, k):
    '''Returns the trained k nearest neighbors classifier
    
    train_knn(x, y, k) -> sklearn.neighbors.KNeighborsClassifier
    '''
    clf = KNeighborsClassifier(k)
    clf.fit(x, y)
    return clf
 
def test_knn(clf, X, Y):
    '''Tests a given classifier with a testset and return result
    
    text_knn(clf, X, Y) -> float
    '''
    predictions = clf.predict(X)
    ratio_correct = len([i for i in range(len(Y)) if Y[i] == predictions[i]]) / len(Y)
    return ratio_correct
 
print('''k\tPercentage of correctly predicted language
__________________________________________________''')
for i in range(1, 16):
    clf = train_knn(x, y, i)
    ratio_correct = test_knn(clf, X, Y)
    print(str(i) + '\t' + str(round(ratio_correct * 100, 3)) + '%')

k	Percentage of correctly predicted language
__________________________________________________
1	98.06%
2	98.053%
3	98.369%
4	98.433%
5	98.599%
6	98.567%
7	98.528%
8	98.599%
9	98.615%
10	98.623%
11	98.607%
12	98.686%
13	98.615%
14	98.694%
15	98.615%


In [76]:
def extract_features(text, features):
    '''Extracts all alphabetic characters and add their ratios as feature
    
    extract_features(text, features) -> np.array
    '''
    textlen = len(text)
    ratios = []
    text = text.lower()
    for feature in features:
        ratios.append(text.count(feature) / textlen)
    return np.array(ratios)
 
def predict_lang(text, clf=clf):
    '''Predicts the language of a given text and classifier
    
    predict_lang(text, clf) -> str
    '''
    extracted_features = extract_features(text, features)
    return clf.predict(np.array(np.array([extracted_features])))[0]
 
text_sample = df.sample(10)['text']
 
for example_text in text_sample:
    print('%-20s'  % predict_lang(example_text, clf) + '\t' + example_text[:60] + '...')

Danish              	Vi hilser det portugisiske formandskabs initiativ velkomment...
Greek               	Για τον λόγο αυτό ταχθήκαμε κατά της συγκέντρωσης πόρων σε μ...
German              	Grenzüberschreitende Dienstleistungen Nach der Tagesordnung ...
Italian             	Nel novembre 1991 , dopo un lungo dibattito all' interno del...
Danish              	Purvis og jeg tænker på , de maltwhishyproducerende dele af ...
Greek               	Θα μπορούσε να αναρωτηθεί κανείς πώς χορηγείται στη σημερινή...
Finnish             	Ei voida myöskään olla välittämättä siitä , mikä merkitys äs...
French              	Ce qui s' est passé avec l' Erika n' est pas le fruit de la ...
Italian             	Tengo a dire che anche la partecipazione del Parlamento euro...
Finnish             	Monissa maissa ihmisoikeuksien yleismaailmallisuuden periaat...


# PREDICTING LANGUAGE

In [111]:
len(l['review'])

2795

In [109]:
english_only = []
reviews = l['review']
for r in reviews:
    if predict_lang(r, clf) == "English":
        english_only.append(r)

In [145]:
def drop_non_english(df):
    english_only = []
    
    for index, row in df.iterrows():
        if predict_lang(row['review'], clf) != "English":
            df.drop(index, inplace=True)
    return df

In [110]:
len(english_only)

2328

In [140]:
english_only[:2]

['The notification badges are showing up on my iPhone 6 Plus but when I open app there not there. This has happened since last update. \nAlso quit interrupting videos with stupid ads. Annoying.',
 'Why do they make changes we don\'t need? Now there is no option to see just "recent posts"! Looks like I\'m going to have to delete the phone app. Way to lose people FB.']

### REMOVING NON-ENGLISH REVIEWS

In [151]:
lfe = l.copy()
lfe.describe()

Unnamed: 0,score
count,2795.0
mean,2.964222
std,1.400279
min,1.0
25%,2.0
50%,3.0
75%,4.0
max,5.0


In [152]:
lfe = drop_non_english(lfe)
len(lfe)

2328

In [153]:
lfe['review'] = add_features(lfe, ['country', 'topic', 'score'])
lfe_reviews = normalize(lfe, 'review')
lfe_reviews = stringify_reviews(lfe_reviews)

In [154]:
lfe_labels = collapse_labels(lfe['sentiment'])
lfe_labels.unique()

array(['neutral', 'negative', 'positive'], dtype=object)

In [155]:
train_clfs(lfe_reviews, lfe_labels)

/\/\/\/\/ multinomial naive bayes
Recall		 61.37339055793991

Precision	 61.13806590320255

F1		 56.2467626611372 

/\/\/\/\/ svm
Recall		 63.66237482117311

Precision	 62.564591581873906

F1		 62.79706442857448 

/\/\/\/\/ logistic regression
Recall		 61.659513590844064

Precision	 60.56307194159024

F1		 60.863130669804676 

/\/\/\/\/ random forest
Recall		 64.52074391988555

Precision	 63.864895596212314

F1		 62.85485758006385 

/\/\/\/\/ sgd
Recall		 64.09155937052932

Precision	 62.71809797800538

F1		 62.48638713930551 

