## Planned Experiments

1. Apply LR with n-gram (BoW) features weighted by TF-IDF to each individual dataset separately
2. Apply LR with n-gram (BoW) features weighted by TF-IDF to the gossip + Kaggle datasets
3. Apply LR with n-gram (BoW) features weighted by TF-IDF to the gossip + Sinhala_Singlish datasets
4. Apply LR with n-gram (BoW) features weighted by TF-IDF to the gossip + Twitter datasets

Next step -> check same LR model with word2vec or any embedding techniques

Next step -> check datasets with Deep learning model 

## Set up experiment settings with Neptune

In [27]:
import neptune
from neptunecontrib.monitoring.metrics import log_binary_classification_metrics, log_classification_report
import os
from dotenv import load_dotenv

# Populate the process environment (load_dotenv) and read the Neptune
# project name and API token from it, so no credential is written in the notebook.
load_dotenv()
NEPTUNE_PROJECT = os.environ.get('NEPTUNE_PROJECT')
NEPTUNE_API_TOKEN = os.environ.get('NEPTUNE_API_TOKEN')

# Kept as the last expression of the cell so the initialised project is
# echoed as the cell output.
neptune.init(
    project_qualified_name=NEPTUNE_PROJECT,
    api_token=NEPTUNE_API_TOKEN,
)
             

Project(codekcg23/Research-Experiments)

## Import modules

In [9]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
#from sklearn.metrics import accuracy_score, f1_score, precision_score,roc_curve,roc_auc_score,confusion_matrix,recall_score
#from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import re

# import helper function script
import os
import sys

# Folder expected to contain the project's `utills` helper module. The
# original checkout location is kept as the default so existing runs behave
# the same; set the SINHALA_HSD_ROOT environment variable to run the notebook
# on another machine without editing this cell.
UTILLS_DIR = os.environ.get('SINHALA_HSD_ROOT', 'G:\\Github\\Sinhala-Hate-Speech-Detection')

# Guard the insert so that re-running the cell does not keep growing sys.path.
if UTILLS_DIR not in sys.path:
    sys.path.insert(1, UTILLS_DIR)
import utills

## Read datasets to dataframes

In [5]:
# Raw CSV locations, one per source corpus (relative to the notebook folder).
gossip_csv = '../Datasets/raw/gossip_dataset_complete_v0.csv'
kaggle_csv = '../Datasets/raw/kaggle_dataset.csv'
twitter_csv = '../Datasets/raw/twitter_dataset.csv'
singlish_csv = '../Datasets/raw/Sinhala_Singlish_Hate_Speech.csv'

# The gossip file is read with header=None (its first row is treated as data);
# the other three files are read with their own header row.
df_A = pd.read_csv(gossip_csv, header=None)
df_B = pd.read_csv(kaggle_csv)      # fb dataset - kaggle
df_C = pd.read_csv(twitter_csv)     # original note: "multiple_domain dataset"
df_D = pd.read_csv(singlish_csv)    # original note: "Twitter dataset - github"
# NOTE(review): the notes on df_C / df_D look mismatched with the file names
# they sit next to — confirm which corpus each one really is.

## Check correctness of preprocessing functions

In [4]:
# Smoke-check every cleaning helper on a hand-written sample and print the
# cleaned text for visual inspection. Each entry is (helper, sample input);
# results are printed in the same order as before, one per line.
# The emoji sample previously contained the invalid escape sequence "\_";
# the backslash is now escaped explicitly, which yields the same runtime string
# without the invalid-escape warning newer Python versions raise.
preprocessing_samples = [
    (utills.removeEnglishWords, 'Ai සපෝර්ට්'),
    (utills.removeMention, '@uilknk @සපෝර්ට් හොදනම්@ '),
    (utills.removeUrl, 'හොදනම්  https://stackoverflow.com/questions/4987327/how-do-i-check-if-a-string-is-unicode-or-ascii සපෝර්ට්https://stackoverflow.com/questions/4987327/how-do-i-check-if-a-string-is-unicode-or-ascii'),
    (utills.removeRetweetState, 'RT @හොදනම් RT @jkl'),
    (utills.removePunctuation, ' වැඩ?-)^  ^ හොදනම්?'),
    (utills.removeNumber, '34 හොදනම්34 2හොදනම්'),
    (utills.removeEmoji, ' 😫හොදනම් 🚗🚉ᓚᘏᗢ:-):-)^_^(*/ω＼*)(^///^):-Dಥ_ಥ༼ つ ◕_◕ ༽つ¯\\_(ツ)_/¯(❁´◡`❁)😚😚😚😎'),
]

for clean_fn, sample_text in preprocessing_samples:
    print(clean_fn(sample_text))

## Prepare datasets for experiments

In [6]:
# Bring the four corpora to one schema: a 'comment' text column, a binary
# 'label' column and a 'df' column naming the source dataset.
df_A.columns = ['comment','label']  # gossip dataset

# Drop identifier / metadata columns, then rename the two remaining columns.
df_B.drop('id',axis=1,inplace=True)
df_C.drop(['instance_id','id','user.id','created_at'],axis=1,inplace=True)
df_C.columns = ['comment','label']       # text => comment, class => label
df_D.drop('PhraseNo',axis=1,inplace=True)
df_D.columns = ['comment','label']       # Phrase => comment, IsHateSpeech => label

# Map the string classes to binary labels.
# The mapped column is assigned back instead of calling
# df['label'].replace(..., inplace=True): an in-place call on a column
# selection is a chained operation that does not update the parent frame when
# pandas Copy-on-Write is active. infer_objects() restores a numeric dtype once
# every string class has been replaced by an int.
map_dict_C = {'Neutral':0,'Sexism':1,'Racist':1}
df_C['label'] = df_C['label'].replace(map_dict_C).infer_objects()

# The helper prints the frame size before and after filtering (see cell output).
# .copy() makes the filtered result an independent frame so the column
# assignments below cannot hit a view of the unfiltered data.
df_D = utills.removeSentenceContainsEnglish(df_D,'comment').copy()
map_dict_D = {'YES':1,'NO':0}
df_D['label'] = df_D['label'].replace(map_dict_D).infer_objects()

# add column to identify datasets
df_A['df'] ='A'
df_B['df'] ='B'
df_C['df'] ='C'
df_D['df'] ='D'


Input dataframe size =  2500
Cleaned dataframe size - removed Strings contain Englishs letters  1446


## Make pairs of datasets

In [7]:
# Stack the gossip dataset (A) with each of the other corpora.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with ignore_index=True produces the same row-wise stack with a
# fresh 0..n-1 index.
df_A_B = pd.concat([df_A, df_B], ignore_index=True)
df_A_C = pd.concat([df_A, df_C], ignore_index=True)
df_A_D = pd.concat([df_A, df_D], ignore_index=True)
print('A and B',len(df_A_B))
print('A and C',len(df_A_C))
print('A and D',len(df_A_D))
df_A_B.head()

A and B 12813
A and C 7879
A and D 7914


Unnamed: 0,comment,label,df
0,මේ වේසිට නීතිය ක්‍රියාත්මක වෙන්නෙ කවදාද ?,1,A
1,තූ ෙනදකින්,1,A
2,අම්මා ලුසීඩා කියුවම මට හිතෙන්නම ෆලූඩා කියලා. හ...,0,A
3,දින ඒකාබද්ධ විපක්ෂයේ මාධ්‍ය ලේකම් රේණුක පෙරේරා...,0,A
4,පූදින්ෙන නැතුව ඉදිං,1,A


## preprocessing

In [19]:
def prepare_dataset(df,name):
    """Preprocess one dataset and split it into train and test parts.

    Parameters
    ----------
    df : DataFrame with a 'comment' column and a 'label' column.
    name : label printed next to the row count of the preprocessed frame.

    Returns
    -------
    tuple
        (X_train, X_test, Y_train, Y_test) from a 70/30 split with
        random_state=0. The features are the 'cleaned' column of the
        preprocessed frame (presumably added by utills.preprocess — confirm
        in the helper module); the targets are the 'label' column.
    """
    cleaned_df = utills.preprocess(df, 'comment')
    print(name, len(cleaned_df))

    features = cleaned_df['cleaned']
    targets = cleaned_df['label']
    split = train_test_split(features, targets, test_size=0.3, random_state=0)
    X_train, X_test, Y_train, Y_test = split

    shapes = (X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)
    print("X train {} Y train {} X test {} Y test {}".format(*shapes))
    return (X_train, X_test, Y_train, Y_test)
    

## Features

In [None]:
# n-gram bag of words
# Raw term counts; documents are tokenized by plain whitespace splitting
# through the custom tokenizer.
# NOTE(review): no ngram_range is passed, so the library default range is
# used — confirm this matches the "n-gram" intent stated above.
# NOTE(review): dataset_check builds its own CountVectorizer inside its loop,
# so this object is not used by the visible experiment code.

bow_vectorizer = CountVectorizer(analyzer="word", tokenizer=lambda text: text.split())

# TF-IDF
# Same whitespace tokenization, with TF-IDF weighting instead of raw counts.
# NOTE(review): not referenced by any other cell visible in this notebook.
tfidf_vectorizer = TfidfVectorizer(analyzer ='word',tokenizer=lambda text: text.split())

#  word2vec

In [None]:
from gensim.models import Word2Vec
import numpy as np
# give a path of model to load function
# 'word2vec.bin' is a relative path, so the file is looked up from the
# notebook's working directory.
# NOTE(review): word_emb_model is not used by any other cell visible in this
# notebook — presumably reserved for the planned embedding experiments.
word_emb_model = Word2Vec.load('word2vec.bin')

## Models

In [10]:
def LR(X_train,X_test,Y_train,vectorizer,feature_name):
    """Fit a vectorizer + logistic-regression pipeline and predict on X_test.

    Parameters
    ----------
    X_train, X_test : training and test inputs passed to the pipeline.
    Y_train : training labels.
    vectorizer : feature extractor used as the first pipeline step.
    feature_name : name given to the vectorizer step inside the pipeline.

    Returns
    -------
    tuple
        (fitted pipeline, predictions for X_test).
    """
    # LogisticRegression is created with its default settings.
    steps = [(feature_name, vectorizer), ('lr', LogisticRegression())]
    lr_pipe = Pipeline(steps)
    lr_pipe.fit(X_train, Y_train)
    return (lr_pipe, lr_pipe.predict(X_test))

 

# def svm()

# def CNN()

In [34]:

def evaluation(name,Y_test,Y_pred):
    """Log one run to Neptune and return its classification report.

    A new Neptune experiment called `name` is created and tagged, the class
    metrics are sent through log_class_metrics, and the scikit-learn
    classification report is printed and returned as a DataFrame with one row
    per report entry.

    The experiment is not stopped inside this function.
    NOTE(review): log_class_metrics is imported in a later cell, so that cell
    must have run before this function is called.
    """
    neptune.create_experiment(name)
    neptune.append_tag(['Dataset experiment',name])

    print(Y_test.shape,Y_pred.shape)
    log_class_metrics(Y_test, Y_pred)

    # output_dict=True gives a nested dict; transposing puts the
    # classes / averages on the rows and the metrics on the columns.
    report_dict = classification_report(Y_test, Y_pred, output_dict=True)
    metrics_table = pd.DataFrame(report_dict).T
    print(metrics_table)
    return metrics_table
    

## Dataset Check

In [36]:

def dataset_check(df_list):
    """Run the bag-of-words + logistic-regression baseline on every dataset.

    Parameters
    ----------
    df_list : dict
        Mapping of dataset name -> DataFrame. Despite the parameter name a
        mapping is required, because the entries are read with .items().
        Earlier the loop ignored this argument and read the global `df_dict`.

    Returns
    -------
    pandas.DataFrame
        The per-dataset classification reports stacked row-wise, with a
        'model' column holding the dataset name. The index is reset
        (ignore_index=True), so the report's row labels are not kept.
        An empty DataFrame is returned when `df_list` has no entries.
    """
    dfs = []
    for name, df in df_list.items():
        X_train, X_test, Y_train, Y_test = prepare_dataset(df, name)
        # A new vectorizer per dataset, so each run fits its own vocabulary.
        bow_vectorizer = CountVectorizer(analyzer="word", tokenizer=lambda text: text.split())
        _, Y_pred = LR(X_train, X_test, Y_train, bow_vectorizer, 'bow')
        result_df = evaluation(name, Y_test, Y_pred)
        result_df['model'] = name
        dfs.append(result_df)

    # Nothing to concatenate: return an empty table instead of failing.
    if not dfs:
        return pd.DataFrame()
    # Concatenate once after the loop rather than on every iteration.
    return pd.concat(dfs, ignore_index=True)


## Compare Results

In [37]:
# dataset name list
from neptunecontrib.monitoring.metrics import expand_prediction,log_class_metrics

# The four single datasets first, then the gossip (A) + other-corpus pairs.
df_dict = {
    'df_A': df_A,
    'df_B': df_B,
    'df_C': df_C,
    'df_D': df_D,
    'df_A_B': df_A_B,
    'df_A_C': df_A_C,
    'df_A_D': df_A_D,
}

# Last expression of the cell: the stacked report table is shown as output.
dataset_check(df_dict)

df_A 6468
X train (4527,) Y train (4527,) X test (1941,) Y test (1941,)
https://app.neptune.ai/codekcg23/Research-Experiments/e/RES-14
(1941,) (1941,)
              precision    recall  f1-score      support
0              0.816099  0.931988  0.870202  1338.000000
1              0.779661  0.533997  0.633858   603.000000
accuracy       0.808346  0.808346  0.808346     0.808346
macro avg      0.797880  0.732992  0.752030  1941.000000
weighted avg   0.804779  0.808346  0.796779  1941.000000
df_B 6345
X train (4441,) Y train (4441,) X test (1904,) Y test (1904,)
https://app.neptune.ai/codekcg23/Research-Experiments/e/RES-15
(1904,) (1904,)
              precision    recall  f1-score      support
0              0.792531  0.879171  0.833606   869.000000
1              0.888298  0.806763  0.845570  1035.000000
accuracy       0.839811  0.839811  0.839811     0.839811
macro avg      0.840414  0.842967  0.839588  1904.000000
weighted avg   0.844589  0.839811  0.840109  1904.000000
df_C 1411
X tr

Unnamed: 0,precision,recall,f1-score,support,model
0,0.816099,0.931988,0.870202,1338.0,df_A
1,0.779661,0.533997,0.633858,603.0,df_A
2,0.808346,0.808346,0.808346,0.808346,df_A
3,0.79788,0.732992,0.75203,1941.0,df_A
4,0.804779,0.808346,0.796779,1941.0,df_A
5,0.792531,0.879171,0.833606,869.0,df_B
6,0.888298,0.806763,0.84557,1035.0,df_B
7,0.839811,0.839811,0.839811,0.839811,df_B
8,0.840414,0.842967,0.839588,1904.0,df_B
9,0.844589,0.839811,0.840109,1904.0,df_B


Unexpected error in ping thread.
Traceback (most recent call last):
  File "C:\Users\Kavishka\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 672, in urlopen
    chunked=chunked,
  File "C:\Users\Kavishka\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 421, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "C:\Users\Kavishka\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 416, in _make_request
    httplib_response = conn.getresponse()
  File "C:\Users\Kavishka\anaconda3\lib\http\client.py", line 1344, in getresponse
    response.begin()
  File "C:\Users\Kavishka\anaconda3\lib\http\client.py", line 306, in begin
    version, status, reason = self._read_status()
  File "C:\Users\Kavishka\anaconda3\lib\http\client.py", line 267, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Users\Kavishka\anaconda3\lib\socket.py", line 589, in readinto
    return self._sock.recv_into

## Working with neptune

In [None]:
#from neptune.new.types import File

# run["prediction_example"].upload(File.as_image(numpy_array))
# run["results"].upload(File.as_html(df_predictions))

#run['matplotlib-fig'].upload(fig
# from neptune.new.types import File

# fig = ...
# run['visuals/plotly-fig'] = File.as_html(fig)
# Pandas DataFrame 
#run['data/sample'].upload(File.as_html(sample_df))
# run["sys/tags"].add(["run-organization", "me"])



In [38]:
# Stop Neptune tracking at the end of the session.
# NOTE(review): evaluation() creates one experiment per dataset and never
# stops any of them; presumably this call only closes the most recent one —
# confirm against the Neptune client documentation.
neptune.stop()

Unexpected error in ping thread.
Traceback (most recent call last):
  File "C:\Users\Kavishka\anaconda3\lib\site-packages\urllib3\connection.py", line 157, in _new_conn
    (self._dns_host, self.port), self.timeout, **extra_kw
  File "C:\Users\Kavishka\anaconda3\lib\site-packages\urllib3\util\connection.py", line 61, in create_connection
    for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
  File "C:\Users\Kavishka\anaconda3\lib\socket.py", line 752, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 11002] getaddrinfo failed

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Kavishka\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 672, in urlopen
    chunked=chunked,
  File "C:\Users\Kavishka\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 376, in _make_request
    self._validate_conn(conn)
  File "C:\Use