In [1]:
from api.classification import run
from api.setting import Settings
from api.functions import *
from api.preprocessing import *
from joblib import Parallel, delayed

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

from pprint import pprint

In [2]:
# Start from the project's default run configuration and point it at the
# harassment dataset used by the rest of this notebook.
settings = Settings()
settings.CSV_PATH = 'data/onlineHarassmentDataset.csv'

# Print every option so the configuration is recorded next to the results.
pprint(vars(settings))

{'AUTOCORRECT': False,
 'BALANCE_DATA': True,
 'CHECK_CONTRACTION': True,
 'CLEAN_TEXT': False,
 'CSV_PATH': 'data/onlineHarassmentDataset.csv',
 'EMOJI_TO_TEXT': True,
 'GENERATOR': 'python',
 'IGNORE_COLUMNS': ['TweetID'],
 'LOWER_TEXT': True,
 'MOST_FREQUENT_WORDS': False,
 'OVERSAMPLING': False,
 'RANDOM_SEED': 16,
 'REMOVE_LINKS': True,
 'REMOVE_NUMBERS': True,
 'REMOVE_PUNCTUATION': True,
 'REMOVE_STOPWORDS': True,
 'REMOVE_USER_TAGS': True,
 'SEPARATOR': ',',
 'STEMMING': True,
 'UNDERSAMPLING': False,
 'WORDCLOUD': False,
 'X_LABEL': 'text',
 'Y_LABEL': 'annotation'}


In [3]:
# Load the raw dataset. Despite the .csv extension (and settings.SEPARATOR
# being ','), the file is parsed here as tab-separated.
#
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` is the equivalent: malformed rows are skipped and a
# warning is emitted for each one instead of aborting the whole read.
data = pd.read_csv(settings.CSV_PATH,
                   sep='\t',
                   engine='python',
                   on_bad_lines='warn')
data.shape

(20360, 13)

In [None]:
# Preview the first rows of the freshly loaded frame to see which columns were parsed.
data.head(n=5)

In [None]:
# Keep only the columns at positions 1 and 2 of the 13 loaded columns
# (the following cells read 'Tweet' and 'Code' from what remains).
to_drop = [0] + list(range(3, 13))
data = data.drop(columns=data.columns[to_drop])

# Shuffle the rows reproducibly. The result must be assigned back: `sample`
# returns a new frame and leaves the original untouched. The index is reset so
# that row labels match row positions again after shuffling, which later
# position-based indexing relies on.
data = (
    data.sample(frac=1, random_state=settings.RANDOM_SEED)
        .reset_index(drop=True)
)
data.head()

In [None]:
# Split the frame into the model input (tweet text) and the target (annotation code).
X, Y = data['Tweet'], data['Code']

In [None]:
# Encode the annotation codes as integers: 'N' -> 0, 'H' -> 1.
# The mapping is applied to the original 'Code' column rather than to `Y`
# itself so the cell can be re-run safely; mapping the already-encoded 0/1
# values a second time would turn every label into NaN.
# Note: any code other than 'N' or 'H' is mapped to NaN by `Series.map`.
Y = data['Code'].map({'N': 0, 'H': 1})
Y.value_counts()

In [None]:
# Run clean_tweet(settings, tweet) on every tweet across all available cores,
# then collect the results into a new Series (which gets a fresh 0..n-1 index).
tweet_cleaner = delayed(clean_tweet)
worker_pool = Parallel(n_jobs=-1, verbose=5, backend="multiprocessing")
X = pd.Series(worker_pool(tweet_cleaner(settings, tweet) for tweet in X))

In [None]:
# Number of entries passed to the frequent-words visualisation.
TOP_WORDS = 20

# Compute word frequencies over the cleaned tweets and plot them.
dic = get_frequent_words(X)
visualize_frequent_words(dic, TOP_WORDS)


In [None]:
# Turn the cleaned tweets into a sparse bag-of-words count matrix.
# NOTE(review): the vocabulary is learned from all rows here, i.e. before the
# cross-validation split below, so test-fold words are part of the feature space.
vectorizer = CountVectorizer()
X_fv = vectorizer.fit_transform(raw_documents=X)


In [None]:
# 3-fold stratified cross-validation: each fold keeps the class proportions of Y.
# For every fold a fresh logistic-regression model is trained and evaluated.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=settings.RANDOM_SEED)

for train_index, test_index in skf.split(X_fv, Y):
    Xtrain, Xtest = X_fv[train_index], X_fv[test_index]
    # split() yields row *positions*, so the labels are selected with .iloc;
    # plain Y[...] would look the numbers up as index labels instead.
    Ytrain, Ytest = Y.iloc[train_index], Y.iloc[test_index]

    # -- classification
    print(Xtrain.shape, Xtest.shape)
    print(Ytrain.shape, Ytest.shape)

    # n_jobs is not passed: it has no effect with the liblinear solver and only
    # triggers a warning. random_state pins liblinear's internal data shuffling
    # so repeated runs give the same model.
    model = LogisticRegression(solver='liblinear',
                               max_iter=100,
                               random_state=settings.RANDOM_SEED)
    model.fit(Xtrain, Ytrain)
    pred = model.predict(Xtest)
    print(metrics.confusion_matrix(Ytest, pred))
    print(metrics.classification_report(Ytest, pred, target_names=["none", "hate"]))
