In [61]:
import nltk
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics 
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import permutation_test_score
import re
import random

In [12]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/fatma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [1]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [2]:
def remove_stop(str):
    stop = set(stopwords.words('english'))
    lst = str.split(" ")
    lst = [i for i in lst if i not in stop]
    return ' '.join(lst)

def remove_unwanted_words(str):
    unwanted_words = ["httpaddress", "usrid", "dd", "rt", "amp", "pm", " ", "'s", "n't", "\t", '``', "''", "", "//", "\\", "\\'s", "\\?", "\?"]
    lst = str.split(" ")
    lst = [i for i in lst if i not in unwanted_words]
    return ' '.join(lst)

def toLower(str):
    lst = str.split(" ")
    lst = [i.lower() for i in lst]
    return ' '.join(lst)

In [42]:
def load_data_and_labels_shuffled(positive_data_file, negative_data_file):
    """
    Loads MR polarity african_data from files, splits the african_data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load african_data from files
    positive_examples = list(open(positive_data_file, "r").readlines())
    positive_examples = [s.strip() for s in positive_examples]
    positive_examples = [remove_stop(item) for item in positive_examples]
    positive_examples = [toLower(item) for item in positive_examples]
    positive_examples = [remove_unwanted_words(item) for item in positive_examples]
    positive_examples = [clean_str(sent) for sent in positive_examples]
    for i in positive_examples:
        if len(i.split(" ")) < 3:
            positive_examples.remove(i)
    positive_examples = list(filter(None, positive_examples))
    negative_examples = list(open(negative_data_file, "r").readlines())
    negative_examples = [s.strip() for s in negative_examples]
    negative_examples = [toLower(item) for item in negative_examples]
    negative_examples = [remove_stop(item) for item in negative_examples]
    negative_examples = [remove_unwanted_words(item) for item in negative_examples]
    negative_examples = [clean_str(sent) for sent in negative_examples]
    for i in negative_examples:
        if len(i.split(" ")) < 3:
            negative_examples.remove(i)

    negative_examples = list(filter(None, negative_examples))

    # Split by words
    x_text = positive_examples + negative_examples
    # x_text = [clean_str(sent) for sent in x_text]
    # Generate labels
    positive_labels = [1 for _ in positive_examples]
    negative_labels = [0 for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)

    random.Random(10).shuffle(x_text)
    random.Random(10).shuffle(y)

    return [x_text, y]

In [43]:
def k_fold_cross_validation(pipeline, df,label, n_folds):
    k_fold = KFold(n=len(df), n_folds=n_folds)
    F1_scores = []
    P_scores = []
    R_scores = []
    confusion = np.array([[0, 0], [0, 0]])
    for train_indices, test_indices in k_fold:
        train_text = df.iloc[train_indices]['text'].values
        train_y = df.iloc[train_indices][label].values

        test_text = df.iloc[test_indices]['text'].values
        test_y = df.iloc[test_indices][label].values

        pipeline.fit(train_text, train_y)
        predictions = pipeline.predict(test_text)

        confusion += confusion_matrix(test_y, predictions)
        F1_score = f1_score(test_y, predictions, pos_label=1)
        P_score = precision_score(test_y, predictions, pos_label=1)
        R_score = recall_score(test_y, predictions, pos_label=1)
        F1_scores.append(F1_score)
        P_scores.append(P_score)
        R_scores.append(R_score)

    print('Total emails classified:', len(df))
    print('F1 Score:', sum(F1_scores)/len(F1_scores))
    print('P Score:', sum(P_scores)/len(P_scores))
    print('R Score:', sum(R_scores)/len(R_scores))
    print('Confusion matrix:')
    print(confusion)

In [97]:
def perf_measure(y_actual, y_hat):
    TP = 0
    FP = 0
    TN = 0
    FN = 0

    for i in range(len(y_hat)): 
        if y_actual[i]==y_hat[i]==1:
           TP += 1
        if y_hat[i]==1 and y_actual[i]!=y_hat[i]:
           FP += 1
        if y_actual[i]==y_hat[i]==0:
           TN += 1
        if y_hat[i]==0 and y_actual[i]!=y_hat[i]:
           FN += 1
        
    return TP, FP,TN, FN 

In [44]:
text, y = load_data_and_labels_shuffled('Data/turkish_protest_test_pos_prccd2.txt', 'Data/turkish_protest_test_neg_prccd2.txt')

In [47]:
df = pd.DataFrame({'text': list(text),'label': y})

In [48]:
df_train = df[:370]
df_test = df[371:]

In [96]:
print("positive exampes", len(df_train[df_train["label"] == 1]))
print("negative exampes", len(df_train[df_train["label"] == 0]))

positive exampes 43
negative exampes 327


In [87]:
pipeline_violence_Svc = Pipeline([
    ('vectorizer',  CountVectorizer(ngram_range=(1, 2),stop_words='english',  min_df=3)),
    ('tfidf_transformer',  TfidfTransformer()),
    ('classifier',  SVC(kernel='linear', gamma=2)) ])

k_fold_cross_validation(pipeline_violence_Svc, df_train, 'label',10)

Total emails classified: 370
F1 Score: 0.175
P Score: 0.35
R Score: 0.129523809524
Confusion matrix:
[[322   5]
 [ 37   6]]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [88]:
#predicted scores
scores = cross_val_score(pipeline_violence_Svc, df_test.text, df_test.label, cv =15)
print(scores.mean())
print(scores.std())

0.814920634921
0.12403967213




In [89]:
y_predict = pipeline_violence_Svc.fit(df_train.text, df_train.label).decision_function(df_test.text)

In [90]:
prediction = pipeline_violence_Svc.predict(df_test.text)

In [91]:
prediction

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [92]:
roc_auc = metrics.roc_auc_score(df_test.label, prediction)
print(roc_auc)

0.539281705948


In [93]:
acc = metrics.accuracy_score(df_test.label, prediction)
print(acc)

0.880434782609


In [99]:
TP, FP, TN, FN = perf_measure(df_test.label.values, prediction)

In [100]:
TP, FP, TN, FN

(1, 1, 80, 10)