In [None]:
import yaml
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

from tools import print_interesting_words
from tools import display_roc_curve

Read configuration from yml file

In [None]:
# Load the run configuration. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open for the kernel's lifetime.
with open('nlp_heb.yml') as config_file:
    cfg = yaml.safe_load(config_file)

fn = cfg['input']               # path of the CSV file read below
MODEL_FILENAME = cfg['output']  # value of the config's 'output' entry

Seed the random number generator and define analysis constants

In [None]:
# Seed NumPy's global random generator from the config so repeated runs
# draw the same pseudo-random sequence.
np.random.seed(cfg['RANDOM_SEED'])

In [None]:
# Analysis constants.
# NOTE(review): none of these names are referenced by the cells visible in
# this notebook — confirm they are still needed.
NUMBER_WORDS_TO_REVIEW = 15
WORD_TO_TEST = 'washington'
CLOSEST_COUNT = 10
NUMBER_OF_CLUSTERS = 15
num_words = 2000

In [None]:
# Echo the configured input path before loading it.
print('Reading data from: ',fn)

In [None]:
# Load the raw review table with pandas' default CSV quoting
# (a `quoting=3` variant was tried and left disabled).
data = pd.read_csv(fn)

Review data

In [None]:
# Report the raw row count, then shuffle every row.
# The shuffle is seeded explicitly (same seed as the train/test split) so this
# cell gives the same order whenever it is run, instead of depending on how
# much of the global NumPy random stream earlier executions have consumed.
print(f'we have {len(data)} lines of data')
data = data.sample(frac=1, random_state=cfg['RANDOM_SEED'])

In [None]:
# Preview the first rows of the shuffled table.
data.head()

# Cleanup and feature engineering

- lowercase
- remove digits
- parse annotation
- remove redundancies (duplicate reviews)
- remove rows with missing values

In [None]:
# Merge the two free-text answers into one lower-cased document per review.
# The explicit space keeps the last word of `prof_text` and the first word of
# `prof_improve` from fusing into a single bogus token when the text is later
# split into words by the vectoriser.
# A missing value in either column still yields NaN for `content`, so those
# rows are dropped below exactly as before.
data['content'] = data['prof_text'].str.lower() + ' ' + data['prof_improve'].str.lower()

# Keep only rows that have both a text document and a satisfaction score.
data = data.dropna(subset=['content', 'prof_sat'])

We ignore reviews rated '4' (we compare the '5' ratings against '1-3').

In [None]:
# Drop the ambiguous score 4, derive the binary label and remove duplicate
# texts in one chain. Every step returns a new frame, so no column is assigned
# onto a filtered slice (which triggers SettingWithCopyWarning) and nothing is
# mutated in place.
data = (
    data[data.prof_sat != 4]
    # int(score / 5) maps the remaining scores 1-3 to 0 and 5 to 1.
    .assign(annotation=lambda frame: frame['prof_sat'].apply(lambda x: int(x / 5)))
    # Keep the first occurrence of each distinct review text.
    .drop_duplicates(subset="content")
)

In [None]:
# Bar chart of how many reviews carry each label (0 / 1) after filtering
# and de-duplication.
sns.countplot(x='annotation', data=data)

In [None]:
# Count the rows per class with boolean masks; `negative_reviews` is reused
# further down when the classes are balanced.
negative_reviews = int((data['annotation'] == 0).sum())
print("negative reviews:", negative_reviews)
print("good reviews:", int((data['annotation'] == 1).sum()))

We train on a balanced set

In [None]:
# Balance the training data by truncating BOTH classes to the size of the
# smaller one. The previous version only truncated the positive class, which
# left the set unbalanced whenever positives were the minority.
# Rows were shuffled right after loading, so taking the leading rows of each
# class is a random subset.
negatives = data[data['annotation'] == 0]
positives = data[data['annotation'] == 1]
per_class = min(len(negatives), len(positives))

data = pd.concat([negatives[:per_class], positives[:per_class]])
sns.countplot(x='annotation', data=data)

In [None]:
# Report the size of the balanced set and preview its first rows.
print(f'We now have {len(data)} data points')
data.head()

In [None]:
# Character length of each review's combined text: print the mean and plot
# the distribution as a histogram.
lens = data.content.str.len()
print(f'Mean message length: {lens.mean():10.1f}')
lens.hist()

# TF-IDF + SVM

In [None]:
# Display names for the two labels, in label order: 0 -> 'negative' (scores
# 1-3) and 1 -> 'positive' (score 5). Passed as `target_names` to the
# classification report.
categories = ['negative' , 'positive']
# Turn off pandas' chained-assignment (SettingWithCopy) warning from here on.
pd.options.mode.chained_assignment = None

In [None]:
# Pull the review texts (features) and the 0/1 labels out of the frame as
# plain arrays for scikit-learn.
X = data['content'].values
Y = data['annotation'].values

In [None]:
# Hold out a test fraction taken from the config. The split is seeded for
# repeatability and stratified on the label so both parts keep the same
# class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=cfg['TEST_FRACTION'],
    random_state=cfg['RANDOM_SEED'],
    stratify=Y,
)

In [None]:
# Text classification pipeline, all stages with their default settings:
#   vect  - raw text -> token count matrix
#   tfidf - counts   -> TF-IDF weights
#   clf   - linear support vector classifier
pipeline = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LinearSVC()),
    ]
)

In [None]:
# Fit every pipeline stage on the training split only.
pipeline.fit(X_train, y_train)

# Evaluate #

In [None]:
# Score the held-out split: per-class precision/recall/F1 plus the raw
# confusion matrix.
predicted = pipeline.predict(X_test)
print(metrics.classification_report(y_test, predicted,
                                    target_names=categories))
print(metrics.confusion_matrix(y_test, predicted))

# Row-normalised confusion matrix plot.
# `metrics.plot_confusion_matrix` was deprecated in scikit-learn 1.0 and
# removed in 1.2; `ConfusionMatrixDisplay.from_estimator` is its replacement.
# Fall back to the legacy function only on installations that predate it.
if hasattr(metrics.ConfusionMatrixDisplay, 'from_estimator'):
    disp = metrics.ConfusionMatrixDisplay.from_estimator(
        pipeline, X_test, y_test, normalize='true')
else:
    disp = metrics.plot_confusion_matrix(pipeline, X_test, y_test, normalize='true')
disp.ax_.set_title("Normalized confusion matrix")

In [None]:
# Project helper imported from `tools`, called with the fitted pipeline.
# NOTE(review): presumably prints the terms the classifier weights most
# heavily — confirm against tools.py.
print_interesting_words(pipeline)

# Negative (detractor) reviews mistakenly classified as positive (promoter)

In [None]:
# False positives: test reviews whose true label is 0 but which the model
# predicted as 1. Walk texts, labels and predictions in lock-step.
xs = [
    text
    for text, truth, guess in zip(X_test, y_test, predicted)
    if truth == 0 and guess == 1
]
for idx, text in enumerate(xs):
    print(idx, text)

# Positive (promoter) reviews mistakenly classified as negative (detractor)

In [None]:
# False negatives: test reviews whose true label is 1 but which the model
# predicted as 0. Walk texts, labels and predictions in lock-step.
xs = [
    text
    for text, truth, guess in zip(X_test, y_test, predicted)
    if truth == 1 and guess == 0
]
for idx, text in enumerate(xs):
    print(idx, text)

# ROC curve 

In [None]:
# Project helper imported from `tools`, called with the fitted pipeline and
# the held-out split.
# NOTE(review): presumably draws the ROC curve — implementation not shown here.
display_roc_curve(pipeline, X_test, y_test)

# References

https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html