In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
import gensim
from gensim.models import LdaModel
from gensim.corpora import Dictionary
import pyLDAvis.gensim

from snippets import (getTopWordsByCategory, 
                      plotTopWordsByCategory, plotTopWords, 
                      plotClassShares, printSampleComments,
                      plotSetIntersections, calculateUncertanityCoeff,
                      plotUncertanityCoeff)

from features_engineering import (calculateTFIDFscore, split_words)
from nb_svm import fitModel, NbSvmClassifier
from load_data import loadData

from config import DATA_FILE, LABELS, CONTENT, UNIQUE_ID, TEST_SIZE, C, NUM_TOPICS

In [2]:
# Load the dataset with the project's loadData helper (load_data module),
# using the DATA_FILE value imported from config.
df = loadData(DATA_FILE)

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Inspect the dtype of every column.
df.dtypes

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Replace missing comment text with the placeholder "unknown".
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the df[CONTENT] selection: that chained in-place
# call operates on an intermediate object, triggers a FutureWarning on recent
# pandas versions, and leaves df untouched once Copy-on-Write is active.
df[CONTENT] = df[CONTENT].fillna("unknown")

# Print the distinct values present in each label column.
for col in LABELS:
    print("Labels for %s are: %s" % (col, ', '.join(map(str, df[col].unique()))))

In [None]:
# Add an extra flag column: 1 when the row's LABELS columns sum to zero, else 0.
INNOCENT_LABEL = "innocent"
df[INNOCENT_LABEL] = df[LABELS].sum(axis=1).eq(0).astype(int)

# Original labels followed by the derived flag.
ALL_LABELS = LABELS + [INNOCENT_LABEL]

In [None]:
# Sample comments
# NOTE(review): the trailing 3 is presumably the number of samples to print --
# confirm against snippets.printSampleComments.
printSampleComments(df, CONTENT, LABELS, 3)

In [None]:
# Share of each label: column-wise mean of the label columns, then plotted.
ratio = df[LABELS].mean(axis=0)
plotClassShares(LABELS, ratio)

In [None]:
# Multiple labels
# Count label assignments by summing the label columns directly. The earlier
# form, int(n_rows * ratio.sum()), went through floating-point means and then
# truncated with int(), so rounding error could report one label too few; it
# also depended on `ratio` from another cell.
print("There are %s comments, but %s comment labels." % (df.shape[0], int(df[LABELS].sum().sum())))

# Keep only the rows whose INNOCENT_LABEL flag is 0 and plot how their labels
# intersect (plotSetIntersections is a project helper from snippets).
df_subset = df.loc[df[INNOCENT_LABEL] == 0]
plotSetIntersections(df_subset, LABELS, UNIQUE_ID)

In [None]:
# Theil's U uncertainty coefficient
# https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
# Note: the helper and variable names below keep their original spelling
# ("Uncertanity") because they are defined/used outside this cell.

uncertanity_coeff = calculateUncertanityCoeff(df, LABELS)
plotUncertanityCoeff(uncertanity_coeff, LABELS)

In [None]:
# Calculate TF-IDF (Term Frequency - Inverse Document Frequency)
# NOTE(review): the helper returns a pair; `word_counts` presumably holds the
# per-document scores and `features` the vocabulary terms -- confirm in
# features_engineering.calculateTFIDFscore.
word_counts, features = calculateTFIDFscore(df[CONTENT])

In [None]:
# Most common words: one call with NUM_WORDS_CAT, one call with NUM_WORDS_ALL
# and aggregate=True.
NUM_WORDS_CAT, NUM_WORDS_ALL = 10, 20

words, counts = getTopWordsByCategory(
    df, LABELS, word_counts, features, NUM_WORDS_CAT
)
words_all, counts_all = getTopWordsByCategory(
    df, LABELS, word_counts, features, NUM_WORDS_ALL, aggregate=True
)

In [None]:
# Plot the top words from the aggregated call (first entry of each result),
# passing "All" as the third argument -- presumably the plot title/category; confirm.
plotTopWords(words_all[0], counts_all[0], "All")

In [None]:
# Plot the per-label top words computed with NUM_WORDS_CAT.
plotTopWordsByCategory(words, counts, LABELS)