In [1]:
import pandas as pd
import numpy as np
import nltk
import time
import matplotlib.pyplot as plt
import seaborn as sns
import os, glob
import csv
from functools import reduce
import re
import string

In [2]:
import numpy as np

# Names of the label columns that the rest of the notebook selects with df[labels].
labels = [
    "Direct Harassment",
    "Hate Speech",
    "Sexual Harassment",
    "Trolling",
    "Others",
    "Toxic",
]

# Per-batch annotation CSVs.
batch_files = [
    'usanews.csv',
    'foxnews0.csv', 'foxnews1.csv', 'foxnews2.csv', 'foxnews3.csv',
    'nogla0.csv',
    'pew0.csv',
    'rae0_0.csv', 'rae0_1.csv',
    'terror0_0.csv', 'terror0_1.csv',
]

# One CSV per video / channel.
vid_files = [
    "foxnews_FULL.csv",
    "rae_FULL.csv",
    "terror_FULL.csv",
    "usanews.csv",
    "pew0.csv",
    "nogla0.csv",
    "drdisrespect_FULL.csv",
]

# One CSV per genre.
genre_files = ["gaming_channels_GENRE.csv", "news_channel_GENRE.csv"]

# Everything the loops below iterate over: per-video files first, then per-genre files.
files = vid_files + genre_files
print(files)


['foxnews_FULL.csv', 'rae_FULL.csv', 'terror_FULL.csv', 'usanews.csv', 'pew0.csv', 'nogla0.csv', 'drdisrespect_FULL.csv', 'gaming_channels_GENRE.csv', 'news_channel_GENRE.csv']


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
def get_tfidf_vect(documents, a, b):
    """Fit a TF-IDF vectorizer on ``documents`` and return its scores.

    Parameters
    ----------
    documents : iterable of str
        One text document per entry.
    a, b : int
        Lower and upper bound of the n-gram range passed to the vectorizer.

    Returns
    -------
    tuple
        ``(tfidf_df, vector, feature_names)`` where ``tfidf_df`` is a dense
        DataFrame with one column per feature, ``vector`` is the sparse matrix
        returned by ``fit_transform`` and ``feature_names`` is a NumPy array of
        the column names.
    """
    vectorizer = TfidfVectorizer(
        ngram_range=(a, b),
        max_df=0.85,
        analyzer='word',
        min_df=1,
        max_features=10000,
        stop_words='english',
        lowercase=True,
    )
    vector = vectorizer.fit_transform(documents)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and only fall back on installs that predate it.
    if hasattr(vectorizer, "get_feature_names_out"):
        names = vectorizer.get_feature_names_out()
    else:
        names = vectorizer.get_feature_names()
    feature_names = np.array(names)

    # toarray() yields a plain ndarray instead of the legacy np.matrix that
    # todense() returns; the resulting DataFrame holds the same values.
    return pd.DataFrame(vector.toarray(), columns=feature_names), vector, feature_names

In [4]:
def get_combined_tfidf_values_per_label(tfidf_df, features=None, label_names=None):
    """Sum the TF-IDF scores of all rows carrying each label.

    Parameters
    ----------
    tfidf_df : pandas.DataFrame
        One column per feature plus one column per label; a row belongs to a
        label when that label's column equals 1.
    features : sequence of str, optional
        Feature columns to sum. Defaults to every column of ``tfidf_df`` that
        is not a label, in their existing order, so the function no longer
        depends on a notebook-level ``features`` variable.
    label_names : sequence of str, optional
        Label columns to aggregate over. Defaults to the notebook-level
        ``labels`` list.

    Returns
    -------
    pandas.DataFrame
        One row per label, with columns ``features + label_names``. The
        feature columns hold the summed scores; the label columns are a
        one-hot marker (1.0 in the column of the label the row describes,
        0.0 elsewhere).
    """
    if label_names is None:
        label_names = labels
    label_names = list(label_names)

    if features is None:
        label_set = set(label_names)
        features = [col for col in tfidf_df.columns if col not in label_set]
    else:
        features = list(features)

    # Collect the rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic.
    rows = []
    for position, label in enumerate(label_names):
        summed = tfidf_df.loc[tfidf_df[label] == 1, features].sum().to_numpy(dtype=float)
        one_hot = np.zeros(len(label_names))
        one_hot[position] = 1.0
        rows.append(np.concatenate([summed, one_hot]))

    return pd.DataFrame(rows, columns=features + label_names, dtype=float)

In [5]:
from scipy.stats import spearmanr
def get_corr_vals(df, label_names=None):
    """Spearman correlation between every label column and every feature column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the label columns plus any number of feature columns.
    label_names : sequence of str, optional
        Label columns to correlate against. Defaults to the notebook-level
        ``labels`` list.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name (all non-label columns, as returned by
        ``Index.difference``), with two columns per label,
        ``"<label> corr coeff"`` and ``"<label> p-val"``, sorted by column name.
        When there are no labels the frame has the feature index and no columns.
    """
    if label_names is None:
        label_names = labels
    label_names = list(label_names)

    features = df.columns.difference(label_names)
    if not label_names:
        # Previously this path raised NameError because the result was only
        # assigned inside the label loop.
        return pd.DataFrame(index=features)

    columns = {}
    for label in label_names:
        p_values = {}
        sp_coeff = {}
        for f in features:
            sp, pval = spearmanr(df[label], df[f])
            p_values[f] = pval
            sp_coeff[f] = sp
        columns[label + " p-val"] = p_values
        columns[label + " corr coeff"] = sp_coeff

    # Build the table once, after all labels are processed, instead of
    # rebuilding it on every loop iteration.
    result = pd.DataFrame.from_dict(columns)
    return result.reindex(sorted(result.columns), axis=1)


In [13]:
src_folder = "../Annotations/CSVs/"
dest_folder = "tfidf_scores_WORDS_per_video/"

# Create the output folder up front so to_csv does not fail on a fresh checkout.
os.makedirs(dest_folder, exist_ok=True)

for file in files:
    # Read the annotation CSV; its first column becomes the index.
    df = pd.read_csv(src_folder + file, index_col=0)
    # Rows without text become empty documents so the vectorizer accepts them.
    df["words"] = df["words"].fillna('')

    # TF-IDF over word 2-grams and 3-grams.
    # `features` stays a notebook-level name because
    # get_combined_tfidf_values_per_label may read it.
    tfidf_df, vector, features = get_tfidf_vect(df['words'], 2, 3)

    # Attach the labels by position. tfidf_df has a fresh 0..n-1 index while
    # df keeps the index read from the CSV, so an index-aligned assignment
    # could silently produce NaN or misassigned labels.
    tfidf_df[labels] = df[labels].to_numpy()

    # Combine the TF-IDF values per label and store them.
    combined_tfidf = get_combined_tfidf_values_per_label(tfidf_df)
    combined_tfidf.to_csv(dest_folder + file)
    

In [None]:
#pd.concat([d.reset_index(drop=True) for d in [df1, df2]], axis=1)

# Word Correlation

In [6]:
src_folder = "tfidf_scores_WORDS_per_video/"
dest_folder = "corr_words_per_label/"

# Create the output folder up front so to_csv does not fail on a fresh checkout.
os.makedirs(dest_folder, exist_ok=True)

for file in files:
    # Read the per-label TF-IDF sums written by the word TF-IDF cell.
    df = pd.read_csv(src_folder + file, index_col=0)
    df = get_corr_vals(df)
    # Write through dest_folder rather than a second hard-coded copy of the
    # path, so changing the folder name in one place is enough.
    df.to_csv(dest_folder + file)



# POS Tag TFIDF

In [13]:
files = ["foxnews_FULL.csv", "rae_FULL.csv",  "terror_FULL.csv",
             "usanews.csv" , "pew0.csv", "nogla0.csv", "gaming_channels_GENRE.csv", "news_channel_GENRE.csv" , "drdisrespect_FULL.csv"]

src_folder = "../Annotations/CSVs/"
dest_folder = "tfidf_scores_POS_per_video/"

# Create the output folder up front so to_csv does not fail on a fresh checkout.
os.makedirs(dest_folder, exist_ok=True)

for file in files:
    # Read the annotation CSV; its first column becomes the index.
    df = pd.read_csv(src_folder + file, index_col=0)
    # Rows without a POS sequence become empty documents.
    df["POS-SEQ"] = df["POS-SEQ"].fillna('')

    # TF-IDF over 2-grams and 3-grams of the POS sequence.
    # NOTE(review): get_tfidf_vect lowercases its input and removes English
    # stop words, which presumably also drops tags such as IN / TO from the
    # POS sequences. Confirm this is intended.
    # `features` stays a notebook-level name because
    # get_combined_tfidf_values_per_label may read it.
    tfidf_df, vector, features = get_tfidf_vect(df['POS-SEQ'], 2, 3)

    # Attach the labels by position. tfidf_df has a fresh 0..n-1 index while
    # df keeps the index read from the CSV, so an index-aligned assignment
    # could silently produce NaN or misassigned labels.
    tfidf_df[labels] = df[labels].to_numpy()

    # Combine the TF-IDF values per label and store them.
    combined_tfidf = get_combined_tfidf_values_per_label(tfidf_df)
    combined_tfidf.to_csv(dest_folder + file)
    

# POS Tag Correlation

In [14]:
src_folder = "tfidf_scores_POS_per_video/"
dest_folder = "corr_pos_seq_per_label/"

# Create the output folder up front so to_csv does not fail on a fresh checkout.
os.makedirs(dest_folder, exist_ok=True)

for file in files:
    # Read the per-label TF-IDF sums written by the POS TF-IDF cell.
    df = pd.read_csv(src_folder + file, index_col=0)
    df = get_corr_vals(df)
    df.to_csv(dest_folder + file)