In [7]:
import pandas as pd
import numpy as np
import re
import json
import collections
from heapq import nlargest
import liwc
import plotly.express as px

from sklearn.feature_extraction.text import CountVectorizer

import spacy
# Load spaCy's small English pipeline once; `nlp` is reused below to POS-tag every article.
nlp = spacy.load("en_core_web_sm")

### Tag news articles with POS & LIWC

In [None]:
# Load the Kaggle training set, drop rows without article text and keep only
# the columns used downstream.
news2 = pd.read_csv('data/kaggle_train.csv')
news2 = (
    news2
    .dropna(subset=['text'])
    .loc[:, ['title', 'author', 'text', 'label']]
    .copy()  # own the data: the column assignments below must not target a slice
)

# One Counter of spaCy coarse POS tags per article. nlp.pipe() batches the
# documents instead of invoking the pipeline once per row.
news2['pos'] = [
    collections.Counter(token.pos_ for token in doc)
    for doc in nlp.pipe(news2['text'])
]


def tokenize(text):
    """Lazily yield each maximal run of word characters found in ``text``.

    This is a deliberately simple regex tokenizer (``\\w+``); swap in a
    smarter one if punctuation or contractions matter.
    """
    word_pattern = re.compile(r'\w+', re.UNICODE)
    yield from (found.group(0) for found in word_pattern.finditer(text))
def parseLIWC(x):
    """Count LIWC category matches over all tokens of ``x``.

    The text is split with ``tokenize`` and every token is passed to the
    module-level ``parse`` callable (created by ``liwc.load_token_parser``),
    which must be defined before this function is called.

    Parameters
    ----------
    x : str
        Raw text to analyse.

    Returns
    -------
    collections.Counter
        Number of token matches for each category yielded by ``parse``.
    """
    # NOTE(review): tokens are passed with their original casing; if the
    # dictionary only contains lowercase entries, capitalised words will not
    # match — confirm against the .dic file before changing, since cached
    # features were built with this behaviour.
    return collections.Counter(
        category
        for token in tokenize(x)
        for category in parse(token)
    )


# Build the LIWC token parser from the project dictionary. `parse` is the
# callable that parseLIWC() looks up at call time.
liwc_parser = liwc.load_token_parser('data/queryDictionary.dic')
parse, category_names = liwc_parser

# One Counter of LIWC category hits per article.
liwc_counts_per_article = news2['text'].apply(parseLIWC)
news2['liwc_ct'] = liwc_counts_per_article


# Round-trip through JSON so the Counter-valued 'pos' and 'liwc_ct' columns
# become nested dicts, which json_normalize expands into dotted columns
# ('pos.<TAG>', 'liwc_ct.<Category>').
json_struct = json.loads(news2.to_json(orient="records"))
# Use the top-level API: pd.io.json.json_normalize is deprecated.
df_flat = pd.json_normalize(json_struct)

# A tag/category that never occurs in an article is a count of zero, not NaN.
df_flat = df_flat.fillna(0)
# Total number of spaCy tokens per article (sum over all POS tag counts).
df_flat['word_count'] = df_flat.loc[:, df_flat.columns.str.startswith('pos.')].sum(axis=1)

In [485]:
# Disabled cache write for the flattened feature table.
# NOTE(review): `df_flat3` is not defined in any visible cell; presumably this
# should write `df_flat` — confirm before re-enabling.
#df_flat3.to_csv('data/flaten_file2.csv')

In [None]:
# Reload the flattened feature table from disk, replacing any in-memory `df_flat`.
# NOTE(review): no visible cell currently writes this CSV — confirm it exists before a fresh run.
df_flat = pd.read_csv('data/flaten_file2.csv')

In [None]:
# Keep only the POS count columns plus the target column.
is_pos_column = df_flat.columns.str.startswith('pos.')
is_label_column = df_flat.columns.str.startswith('label')
pos = df_flat.loc[:, is_pos_column | is_label_column]

In [None]:
# Total POS counts per label, transposed so rows are POS tags and columns are labels.
pos_totals_by_label = pos.groupby('label').sum()
pos_grouped = pos_totals_by_label.transpose()

In [None]:
# Average POS counts per article: divide each label's totals by the number of
# articles with that label. The class sizes are derived from the data
# (they were hard-coded as 0: 10387 and 1: 10374), and the table is rebuilt
# from `pos` so re-running this cell does not divide twice.
articles_per_label = pos.groupby('label').size()
pos_grouped = pos.groupby('label').sum().T.div(articles_per_label, axis='columns')

In [None]:
# delta = (label 1 value) - (label 0 value); largest differences first.
label_gap = pos_grouped[1] - pos_grouped[0]
pos_grouped = pos_grouped.assign(delta=label_gap).sort_values(by='delta', ascending=False)

In [None]:
# Long-format frames (value, label, pos) for plotting, one per class.
# Built with rename/assign so no column is ever assigned onto a slice of
# `pos_grouped` (which triggers SettingWithCopyWarning).
# NOTE(review): label 0 is shown as 'False' and label 1 as 'True'; confirm
# this matches the intended meaning of the dataset's label column.
fal = (
    pos_grouped[[0]]
    .rename(columns={0: 'value'})
    .assign(label='False', pos=pos_grouped.index)
)
tru = (
    pos_grouped[[1]]
    .rename(columns={1: 'value'})
    .assign(label='True', pos=pos_grouped.index)
)

In [None]:
# Stack the two per-class frames row-wise into one long table for the chart.
fal_tru = pd.concat([fal, tru])

### Viz: Part of Speech

In [None]:
import altair as alt

# One small bar panel per POS tag, comparing the two labels side by side.
pos_chart_encoding = {
    'x': 'label:O',
    'y': 'value:Q',
    'color': 'label:N',
    'column': 'pos:N',
}
alt.Chart(fal_tru).mark_bar().encode(**pos_chart_encoding)

### Viz: LIWC

In [317]:
# Keep only the LIWC count columns plus the target column.
liwc_df = df_flat.loc[:, (df_flat.columns.str.startswith('liwc')|df_flat.columns.str.startswith('label'))]

# Average LIWC counts per article for each label. Class sizes come from the
# data instead of the hard-coded 10387 / 10374.
articles_per_label = liwc_df.groupby('label').size()
liwc_grouped = liwc_df.groupby('label').sum().T.div(articles_per_label, axis='columns')

# Long-format frames for plotting, built with rename/assign so nothing is
# assigned onto a slice of `liwc_grouped`. The category column keeps the name
# 'pos' because the chart encoding below refers to it.
liwc_fal = (
    liwc_grouped[[0]]
    .rename(columns={0: 'value'})
    .assign(label='False', pos=liwc_grouped.index)
)
liwc_tru = (
    liwc_grouped[[1]]
    .rename(columns={1: 'value'})
    .assign(label='True', pos=liwc_grouped.index)
)

liwc_fal_tru = pd.concat([liwc_fal, liwc_tru],axis=0)


# One small bar panel per LIWC category, comparing the two labels.
alt.Chart(liwc_fal_tru).mark_bar().encode(
    x='label:O',
    y='value:Q',
    color='label:N',
    column='pos:N'
)

In [350]:
# delta = (label 1 value) - (label 0 value); largest differences first.
liwc_label_gap = liwc_grouped[1] - liwc_grouped[0]
liwc_grouped = liwc_grouped.assign(delta=liwc_label_gap).sort_values(by='delta', ascending=False)

In [None]:
# Rebuild the per-label LIWC averages (this drops any 'delta' column added earlier).
liwc_df = df_flat.loc[:, (df_flat.columns.str.startswith('liwc')|df_flat.columns.str.startswith('label'))]

# Class sizes come from the data instead of the hard-coded 10387 / 10374.
articles_per_label = liwc_df.groupby('label').size()
liwc_grouped = liwc_df.groupby('label').sum().T.div(articles_per_label, axis='columns')

# Rows = labels, columns = the hand-picked LIWC categories to chart.
liwc_grouped_chart_d = liwc_grouped.T[['liwc_ct.Certain', 'liwc_ct.Informal', 'liwc_ct.Cause', 'liwc_ct.Ipron','liwc_ct.Adj', 'liwc_ct.Prep', 'liwc_ct.Function']]

            

### Training Model: POS

In [305]:
from sklearn.model_selection import train_test_split


In [408]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
# Pipeline was used below without ever being imported (NameError on a fresh kernel).
from sklearn.pipeline import Pipeline

# Features are the per-article POS counts. Drop the target by name rather than
# assuming it is the first column, so the label can never leak into X.
X_train, X_test, y_train, y_test = train_test_split(
    pos.drop(columns='label'),
    list(pos['label']),
    test_size=0.2,
    random_state=42,
)

names = [
         "Naive Bayes",
         "Linear SVM",
         "Logistic Regression",
         "Random Forest",
         "Multilayer Perceptron"
        ]

# Stochastic estimators are seeded so repeated runs give the same scores.
classifiers = [
    MultinomialNB(),
    LinearSVC(random_state=42),
    LogisticRegression(),
    RandomForestClassifier(random_state=42),
    MLPClassifier(random_state=42)
]

# One 5-value grid per classifier, in the same order as `names` / `classifiers`.
parameters = [
              {'clf__alpha': (10, 1, 1e-1, 1e-2, 1e-3)},
              {'clf__C': (np.logspace(-5, 1, 5))},
              {'clf__C': (np.logspace(-5, 1, 5))},
              {'clf__max_depth': (1, 2, 5, 10, 50)},
              {'clf__alpha': (10, 1, 1e-1,1e-2, 1e-3)}
             ]

# Each record: [name, estimator, grid, held-out score, best params,
#               best CV score, full cv_results_]. Later cells index into it
# positionally (0 = name, 6 = cv_results_), so keep this order.
result = []
for name, classifier, params in zip(names, classifiers, parameters):
    # Single-step pipeline so the grid keys can use the 'clf__' prefix.
    clf_pipe = Pipeline([
        ('clf', classifier),
    ])
    gs_clf = GridSearchCV(clf_pipe, param_grid=params,cv=5,n_jobs=3,verbose=2)
    clf = gs_clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    result.append([name,classifier,params,score,clf.best_params_, clf.best_score_, clf.cv_results_])
    

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  20 out of  25 | elapsed:    1.2s remaining:    0.3s
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:    1.4s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=3)]: Done  20 out of  25 | elapsed:    1.3s remaining:    0.3s
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:    3.3s finished


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  20 out of  25 | elapsed:    1.1s remaining:    0.3s
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:    1.4s finished

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:   23.5s finished


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:  1.0min finished

Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.



### Training Model: LIWC

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
# Pipeline was used below without ever being imported (NameError on a fresh kernel).
from sklearn.pipeline import Pipeline

# Features are the per-article LIWC counts. Drop the target by name rather
# than assuming it is the first column, so the label can never leak into X.
X_train, X_test, y_train, y_test = train_test_split(
    liwc_df.drop(columns='label'),
    list(liwc_df['label']),
    test_size=0.2,
    random_state=42,
)

names = [
         "Naive Bayes",
         "Linear SVM",
         "Logistic Regression",
         "Random Forest",
         "Multilayer Perceptron"
        ]

# Stochastic estimators are seeded so repeated runs give the same scores.
classifiers = [
    MultinomialNB(),
    LinearSVC(random_state=42),
    LogisticRegression(),
    RandomForestClassifier(random_state=42),
    MLPClassifier(random_state=42)
]

# One 5-value grid per classifier, in the same order as `names` / `classifiers`.
parameters = [
              {'clf__alpha': (10, 1, 1e-1, 1e-2, 1e-3)},
              {'clf__C': (np.logspace(-5, 1, 5))},
              {'clf__C': (np.logspace(-5, 1, 5))},
              {'clf__max_depth': (1, 2, 5, 10, 50)},
              {'clf__alpha': (10, 1, 1e-1,1e-2, 1e-3)}
             ]

# Each record: [name, estimator, grid, held-out score, best params,
#               best CV score, full cv_results_]. Later cells index into it
# positionally (0 = name, 6 = cv_results_), so keep this order.
result_liwc = []
for name, classifier, params in zip(names, classifiers, parameters):
    # Single-step pipeline so the grid keys can use the 'clf__' prefix.
    clf_pipe = Pipeline([
        ('clf', classifier),
    ])
    gs_clf = GridSearchCV(clf_pipe, param_grid=params,cv=5,n_jobs=3,verbose=2)
    clf = gs_clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    result_liwc.append([name,classifier,params,score,clf.best_params_, clf.best_score_, clf.cv_results_])
    

### Save the best model to a pickle file

In [421]:
# Imported here so this cell does not depend on an earlier training cell
# having run; Pipeline in particular was never imported anywhere.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Re-run the random-forest depth search on its own to obtain the estimator to persist.
parameters = [
              {'clf__max_depth': (1, 2, 5, 10, 50)}
             ]

clf_pipe = Pipeline([
        # Seeded so the persisted model is reproducible.
        ('clf', RandomForestClassifier(random_state=42)),
    ])
gs_clf = GridSearchCV(clf_pipe, param_grid=parameters,cv=5,n_jobs=3,verbose=2)
# NOTE(review): X_train / X_test are whatever the most recently executed
# training cell left behind. A later cell strips a 'liwc_ct.' prefix from
# X_train.columns, so presumably the LIWC split is intended — confirm.
clf = gs_clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:   25.0s finished


In [423]:
# Take the pipeline with the best cross-validated parameters found by the grid search.
model = clf.best_estimator_

In [424]:
import pickle

# Persist the selected estimator. The context manager closes the file even if
# pickling raises part-way through.
with open("classifier.pkl", mode="wb") as pickle_out:
    pickle.dump(model, pickle_out)

### Save Model Results & Parameter Table

In [None]:
# One row per POS model: its name followed by the mean CV score of each of the
# five grid candidates (p1..p5, in grid order).
pos_score_rows = [
    [entry[0], *entry[6].get('mean_test_score')]
    for entry in result
]
pos_result = pd.DataFrame(pos_score_rows, columns=['model','p1','p2','p3','p4','p5'])

In [None]:
# One row per LIWC model: its name followed by the mean CV score of each of
# the five grid candidates (p1..p5, in grid order).
liwc_score_rows = [
    [entry[0], *entry[6].get('mean_test_score')]
    for entry in result_liwc
]
liwc_result = pd.DataFrame(liwc_score_rows, columns=['model','p1','p2','p3','p4','p5'])

In [None]:
# Reshape the wide score table (model, p1..p5) into long form with columns
# (model, acuracy, parameter). The per-candidate frames are built with
# rename/assign (no assignment onto a slice) and concatenated once rather
# than growing a frame inside the loop.
# The misspelled 'acuracy' column name is kept because it is part of the
# exported CSV schema.
pos_result_flatten = pd.concat(
    [
        pos_result[['model', score_col]]
        .rename(columns={score_col: 'acuracy'})
        .assign(parameter=score_col)
        for score_col in ('p' + str(i) for i in range(1, 6))
    ],
    axis=0,
)


In [None]:
# Reshape the wide score table (model, p1..p5) into long form with columns
# (model, acuracy, parameter). The per-candidate frames are built with
# rename/assign (no assignment onto a slice) and concatenated once rather
# than growing a frame inside the loop.
# The misspelled 'acuracy' column name is kept because it is part of the
# exported CSV schema.
liwc_result_flatten = pd.concat(
    [
        liwc_result[['model', score_col]]
        .rename(columns={score_col: 'acuracy'})
        .assign(parameter=score_col)
        for score_col in ('p' + str(i) for i in range(1, 6))
    ],
    axis=0,
)


In [479]:
# Write both long-format score tables to disk (index column included).
result_exports = (
    (pos_result_flatten, 'data/pos_result_flatten.csv'),
    (liwc_result_flatten, 'data/liwc_result_flatten.csv'),
)
for flattened_scores, csv_path in result_exports:
    flattened_scores.to_csv(csv_path)

### Transform future input

In [None]:
x = "adobochron 1 Comment SAN FRANCISCO, California ( The Adobo Chronicles, San Francisco Bureau) – Now it can be told — now that Hillary Clinton is all but assured of the Democratic presidential nomination. For months since the former secretary of state launched her presidential bid, her campaign logo has always symbolized America’s quest to move forward, with the arrow in the logo pointing to the right. Left-handed people, of course, dispute that symbolism because for them, moving forward should be pointing to the left. But there is another school of thought about Hillary’s logo. Many believe that the arrow pointing right actually symbolizes moving to the (conservative) right as opposed to the (liberal) left. The logo was a brilliant concept designed to help the Democrats win over Republican voters in the general elections — voters who will definitely choose a rightist over a leftist candidate. The GOP’s presumptive nominee, Donald Trump, is perceived by many as more leftist than rightist, which would give Clinton the edge among conservative voters. Who knew the placement or direction of an arrow could spell victory or defeat for an American presidential candidate? Rate this:"
# LIWC category counts for the new text.
x_parsed = parseLIWC(x)

# Arrange the counts in the training feature order; a Counter returns 0 for
# categories that do not occur in the text.
liwc_ct = [x_parsed[name.replace("liwc_ct.", "")] for name in X_train.columns]
liwc_sum = sum(liwc_ct)
# Share of each category among all hits. A text with no LIWC hits at all
# yields all zeros instead of raising ZeroDivisionError.
if liwc_sum:
    liwc_ip = [count / liwc_sum for count in liwc_ct]
else:
    liwc_ip = [0.0] * len(liwc_ct)
# NOTE(review): the visible training cells fit on raw counts, not proportions —
# confirm which of `liwc_ct` / `liwc_ip` the saved model should receive.