In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from IPython import get_ipython
import collections
from datetime import datetime
import numpy as np
from model import RFModel

# Load the labelled training set.
with open('lib/data/my_clean_data_training.csv') as f:
    my_data_training = pd.read_csv(f, sep=',')
print("Clean Dataset Shape for Training: ", my_data_training.shape)

# Load the second data set; every one of its rows is labelled 'normal' here.
with open('lib/data/my_clean_data_normies.csv') as f:
    my_data_normies = pd.read_csv(f, sep=',')
print("Clean Dataset Shape for Normies: ", my_data_normies.shape)
my_data_normies['target'] = 'normal'

# Stack both frames row-wise. DataFrame.append was removed in pandas 2.0;
# pd.concat with its defaults (original index labels kept, no column sort)
# produces the same combined frame.
my_data = pd.concat([my_data_training, my_data_normies])
print("Clean Dataset Shape Combined: ", my_data.shape)

In [None]:
# Keep only the first row for each (author, link_id, created_utc) combination.
my_data = my_data.drop_duplicates(subset=['author', 'link_id', 'created_utc'])

# Every row written by PoliticsModeratorBot is labelled as a bot.
is_mod_bot = my_data['author'] == 'PoliticsModeratorBot'
my_data.loc[is_mod_bot, 'target'] = 'bot'

# Any author already labelled 'bot' somewhere must not stay 'normal' elsewhere:
# collect the known bot authors, then relabel their remaining 'normal' rows.
bot_authors = my_data.loc[my_data['target'] == 'bot', 'author'].unique()
normal_rows_by_bots = (
    (my_data['target'] == 'normal') & my_data['author'].isin(bot_authors)
)
my_data.loc[normal_rows_by_bots, 'target'] = 'bot'

In [None]:
# Remove the columns that are not used further on in this notebook.
columns = ['link_id', 'author', 'created_utc', 'body']
my_data = my_data.drop(columns=columns)
print("After removing columns not considered: ", my_data.shape)

In [None]:
# Summary statistics for the rows labelled 'normal'.
my_data.loc[my_data['target'] == 'normal'].describe()

In [None]:
# Summary statistics for the rows labelled 'bot'.
my_data.loc[my_data['target'] == 'bot'].describe()

In [None]:
# Summary statistics for the rows labelled 'troll'.
my_data.loc[my_data['target'] == 'troll'].describe()

In [None]:
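# Set fractions between the user classes (currently disabled).
# NOTE: the commented-out code below compares `target` against integers
# (0/1/2), whereas `target` holds the string labels 'normal', 'bot' and
# 'troll' in this notebook; update the comparisons before re-enabling it.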
#print("\nFixing ratios between classes")
#dataset = my_data[my_data.target==2]
#dataset = dataset.append(my_data[my_data.target==1].sample(n=len(dataset)*2))
#dataset = dataset.append(my_data[my_data.target==0])
#my_data = dataset

In [None]:
# Class balance: number of rows per label.
targets = collections.Counter(my_data['target'])
print(targets)

# Separate the feature frame from the label vector (labels as a NumPy array).
input_x = my_data.drop(columns=['target'])
input_y = my_data['target'].values

# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    input_x, input_y, random_state=16, test_size=0.3)

In [None]:
# Shallow decision tree with per-class weights (errors on 'troll' and 'bot'
# cost more than errors on 'normal') and at least 100 samples per leaf.
# random_state pins the estimator's internal feature permutation so that
# re-running the notebook yields the same tree; it reuses the seed of the
# train/test split above.
clf = DecisionTreeClassifier(max_depth=3,
                             class_weight={'normal': 1, 'bot': 2.5, 'troll': 5},
                             min_samples_leaf=100,
                             random_state=16)

# Fit on the training split.
clf.fit(X_train, y_train)

# Predict the held-out test split.
y_pred = clf.predict(X_test)
y_true = y_test

# Confusion matrix (rows: true label, columns: predicted label, with totals).
matrix = pd.crosstab(y_true, y_pred, rownames=['True'],
                     colnames=['Predicted'], margins=True)
print(matrix)

# Overall scores, followed by per-class scores (average=None returns one
# value per class).
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Mcc:", metrics.matthews_corrcoef(y_test, y_pred))
print("F1 :", metrics.f1_score(y_test, y_pred, average=None))
print("Recall :", metrics.recall_score(y_test, y_pred, average=None))
print("Precision:", metrics.precision_score(y_test, y_pred, average=None))

# Feature importances, labelled with the columns the tree was fitted on and
# sorted from most to least important.
feature_imp = pd.Series(
        clf.feature_importances_,
        index=X_train.columns).sort_values(ascending=False)
print(feature_imp)

In [None]:
# Predict the training split itself, to compare against the test-set results.
y_true = y_train
y_pred = clf.predict(X_train)

# Confusion matrix on the training data (rows: true, columns: predicted).
matrix = pd.crosstab(y_true, y_pred, margins=True,
                     rownames=['True'], colnames=['Predicted'])
print(matrix)

In [None]:
estimator = clf

from sklearn.tree import export_graphviz
# Export the fitted tree as a dot file.
# Feature names come from the frame the tree was fitted on (the original code
# referenced an undefined name `data`). Class names are read from the fitted
# estimator so they always match the classes it actually saw, in its order.
export_graphviz(estimator, out_file='tree.dot',
                feature_names=list(X_train.columns),
                class_names=[str(label) for label in estimator.classes_],
                rounded=False, proportion=False,
                precision=5, filled=True)

# Convert to png using system command (requires Graphviz).
# check_call raises if `dot` exits with an error, so a failed conversion is
# not silently followed by displaying a stale or missing tree.png.
from subprocess import check_call
check_call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

# Display in jupyter notebook
from IPython.display import Image
Image(filename='tree.png')

In [None]:
# Build the model and pickle it for use by the API
model = RFModel()

# Set up the underlying classifier. The meaning of the argument (3) is
# defined by RFModel.create, which is not part of this notebook.
model.create(3)

# Fit on the training split, then predict the held-out test split.
model.train(X_train, y_train)
y_pred = model.predict(X_test)
y_true = y_test

# Confusion matrix (rows: true label, columns: predicted label, with totals).
matrix = pd.crosstab(y_true, y_pred, rownames=['True'],
                     colnames=['Predicted'], margins=True)
print(matrix)

# Overall scores, followed by per-class scores (average=None returns one
# value per class).
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Mcc:", metrics.matthews_corrcoef(y_test, y_pred))
print("F1 :", metrics.f1_score(y_test, y_pred, average=None))
print("Recall :", metrics.recall_score(y_test, y_pred, average=None))
print("Precision:", metrics.precision_score(y_test, y_pred, average=None))

# Feature importances sorted from most to least important. They are printed
# explicitly: a bare expression in the middle of a cell is never displayed.
feature_imp = pd.Series(
        model.feature_importances(),
        index=my_data.columns.drop('target')).sort_values(ascending=False)
print(feature_imp)

# Persist through RFModel's own helpers (what exactly is written, and where,
# is decided inside RFModel).
model.pickle_clf()
model.pickle_clean_data()

In [None]:
# Bar chart of how many rows carry each target label.
from IPython import get_ipython
import matplotlib.pyplot as plt

ipy = get_ipython()
# Only plot when running under IPython/Jupyter (get_ipython() returns None
# otherwise).
if ipy is not None:
    ipy.run_line_magic('matplotlib', 'inline')

    # One bar per distinct value of the 'target' column.
    sns.set(style="darkgrid")
    sns.countplot(data=my_data, x="target")

    # Title and axis labels.
    plt.title("Targets Distribution")
    plt.xlabel('Target Score')
    plt.ylabel('Targets')
    plt.show()

In [None]:
# Horizontal bar chart of the feature importances held in `feature_imp`.
ipy = get_ipython()
# Only plot when running under IPython/Jupyter (get_ipython() returns None
# otherwise).
if ipy is not None:
    ipy.run_line_magic('matplotlib', 'inline')

    # Importance on the x axis, one bar per feature name on the y axis.
    sns.barplot(y=feature_imp.index, x=feature_imp)

    # Title and axis labels.
    plt.title("Visualizing Important Features")
    plt.xlabel('Feature Importance Score')
    plt.ylabel('Features')
    plt.show()