In [6]:
import pandas as pd
import numpy as np
import json
import random

In [7]:
n_features = 5000
n_components = 5
n_top_words = 15

In [8]:
#lda_topics = ['talk.politics.mideast', 'rec.sport.hockey', 'soc.religion.christian', 'sci.crypt', 'comp.graphics']

In [605]:
labels = ["Direct Harassment", "Hate Speech","Sexual Harassment","Trolling", "Others", "Toxic"]

batch_files = ['usanews.csv', 'foxnews0.csv', 'foxnews1.csv', 'foxnews2.csv', 'foxnews3.csv', 
                'nogla0.csv','pew0.csv', 'rae0_0.csv', 
                 'rae0_1.csv', 'terror0_0.csv', 'terror0_1.csv']

vid_files = ["foxnews_FULL.csv", "rae_FULL.csv",  "terror_FULL.csv",
             "usanews.csv" , "pew0.csv", "nogla0.csv", "drdisrespect_FULL.csv"]
genre_files = ["gaming_channels_GENRE.csv", "news_channel_GENRE.csv" ]

files = []
files.extend(vid_files)
files.extend(genre_files)
print(files)

['foxnews_FULL.csv', 'rae_FULL.csv', 'terror_FULL.csv', 'usanews.csv', 'pew0.csv', 'nogla0.csv', 'drdisrespect_FULL.csv', 'gaming_channels_GENRE.csv', 'news_channel_GENRE.csv']


# Use here

In [580]:
from scipy.stats import spearmanr
def get_corr_vals(df):
    features = df.columns.difference(labels)
    pvals = {}
    spm = {}
    for label in labels:
        p_values = {}
        sp_coeff = {}
        for f in features:
            #print(f)
            sp, pval = spearmanr(df[label], df[f])
            p_values[f] = pval
            sp_coeff[f] = sp
        pvals[label + "_p-val"] = p_values
        spm[label] = sp_coeff
        
        x = pd.DataFrame.from_dict(pvals)
        y = pd.DataFrame.from_dict(spm)
        z = pd.concat([x,y], axis=1)
        z = z.reindex(sorted(z.columns), axis=1)
    return z

from sklearn.decomposition import NMF
# X - values
# y - columns  **vect.get_feature_names()**
def get_nmf_model(df, X, y):
    model = NMF(n_components=100) # Create an NMF instance: model
    model.fit(X)    # Fit the model to TF-IDF
    nmf_features = model.transform(X) # Transform the TF-IDF: nmf_features
    word_df = pd.DataFrame(model.components_, columns=y)
    
    doc_df = pd.DataFrame(nmf_features)
    
    doc_df[labels] = df[labels]
    #top -> components_df.iloc[topic].nlargest(10)
    return word_df, doc_df


from sklearn.feature_extraction.text import TfidfVectorizer
def get_tfidf_vector(doc):
    vect = TfidfVectorizer()
    X = vect.fit_transform(doc) 
    y = vect.get_feature_names()
    return X, y

In [581]:
#https://python-bloggers.com/2021/01/topic-modelling-with-nmf-in-python/
src_folder = "../Annotations/CSVs/"
dest_folder = { "WORD": "NMF_WORD_TOPIC/", "DOC": "NMF_DOC_TOPIC/"}
for file in genre_files:
    df = pd.read_csv(src_folder + file, index_col = 0)
    df["words"] = df["words"].replace(np.nan, '', regex=True).apply(str)
    #NMF TOPIC MODELING
    X,y = get_tfidf_vector(df["words"])
    word_df, topic_df = get_nmf_model(df,X,y)
    print(f' topic_word: {word_df.shape},  doc_topic: {topic_df.shape}')
    #SAVE
    word_df.to_csv(dest_folder["WORD"] + file)
    topic_df.to_csv(dest_folder["DOC"] + file)

  exec(code_obj, self.user_global_ns, self.user_ns)


 topic_word: (100, 10884),  doc_topic: (33069, 106)
 topic_word: (100, 9928),  doc_topic: (20677, 106)


# Correlation Coefficients and P-Value

In [584]:
src_folder = { "WORD": "NMF_WORD_TOPIC/", "DOC": "NMF_DOC_TOPIC/"}
dest_folder = "TOPIC_DOC_CORR/"
for file in genre_files:
    topic_df = pd.read_csv(src_folder["DOC"] + file, index_col = 0)
    corr_df = get_corr_vals(topic_df)
    corr_df = corr_df.reset_index(drop=False)
    corr_df = corr_df.rename(columns={'index': 'Topic'})
    display(corr_df)
    corr_df.to_csv(dest_folder + file)

Unnamed: 0,Topic,Direct Harassment,Direct Harassment_p-val,Hate Speech,Hate Speech_p-val,Others,Others_p-val,Sexual Harassment,Sexual Harassment_p-val,Toxic,Toxic_p-val,Trolling,Trolling_p-val
0,0,0.019779,0.000322,-0.014920,6.664506e-03,0.007302,0.184245,0.007155,0.193230,0.014130,1.018344e-02,-0.002764,0.615287
1,1,0.001906,0.728927,-0.019628,3.575451e-04,0.002838,0.605853,-0.007585,0.167831,0.004919,3.710784e-01,-0.004476,0.415675
2,10,0.005659,0.303454,0.001947,7.233423e-01,0.001155,0.833712,-0.000867,0.874793,0.001906,7.288442e-01,0.010586,0.054228
3,11,0.022666,0.000038,0.144820,1.929702e-154,0.003124,0.569965,0.016396,0.002866,0.038033,4.561712e-12,0.022189,0.000054
4,12,0.022192,0.000054,-0.013822,1.195318e-02,-0.003590,0.513840,0.000057,0.991756,0.010355,5.970585e-02,0.012781,0.020111
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,0.008154,0.138142,-0.005079,3.556609e-01,-0.004845,0.378276,-0.007814,0.155352,0.004244,4.402233e-01,-0.005023,0.361051
96,96,0.022787,0.000034,-0.006694,2.234717e-01,0.001825,0.739959,0.007070,0.198570,0.013457,1.439578e-02,0.014515,0.008300
97,97,0.023818,0.000015,0.004133,4.522653e-01,-0.004758,0.386959,0.001244,0.821053,0.010643,5.293500e-02,0.003011,0.584020
98,98,0.008821,0.108701,-0.017819,1.192851e-03,0.002684,0.625487,-0.002722,0.620679,0.002183,6.914246e-01,-0.005448,0.321868


Unnamed: 0,Topic,Direct Harassment,Direct Harassment_p-val,Hate Speech,Hate Speech_p-val,Others,Others_p-val,Sexual Harassment,Sexual Harassment_p-val,Toxic,Toxic_p-val,Trolling,Trolling_p-val
0,0,-0.023375,7.751542e-04,-0.011811,8.945374e-02,0.003316,0.633460,-0.015255,0.028263,-0.013334,5.519728e-02,-0.012185,7.975170e-02
1,1,-0.001092,8.752703e-01,0.008363,2.291755e-01,0.011369,0.102103,0.008158,0.240811,0.013869,4.612013e-02,0.020141,3.776053e-03
2,10,-0.012773,6.626987e-02,0.003864,5.784995e-01,-0.001266,0.855508,-0.010726,0.123001,-0.002433,7.264151e-01,0.007858,2.585273e-01
3,11,0.045642,5.166531e-11,0.027858,6.165774e-05,0.024543,0.000416,0.002279,0.743187,0.047999,5.000917e-12,0.043728,3.165611e-10
4,12,0.011155,1.087240e-01,0.017046,1.423889e-02,0.010697,0.124028,0.001729,0.803667,0.020343,3.440756e-03,0.028436,4.323154e-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,0.057530,1.244038e-16,0.059983,6.000836e-18,0.033148,0.000002,0.007274,0.295617,0.056029,7.469264e-16,0.061924,4.996904e-19
96,96,0.035241,4.002953e-07,0.032627,2.698747e-06,0.026780,0.000117,0.004747,0.494896,0.035161,4.254698e-07,0.052826,2.940075e-14
97,97,0.022178,1.426087e-03,0.028424,4.354392e-05,0.028864,0.000033,-0.008654,0.213358,0.027052,1.000598e-04,0.055769,1.015069e-15
98,98,0.022599,1.154531e-03,0.011797,8.982082e-02,0.010257,0.140243,0.018995,0.006306,0.017972,9.756926e-03,0.033481,1.468504e-06


# Get Toxic p-val < 0.05

In [607]:
src_folder = "TOPIC_DOC_CORR/"
dest_folder = "RESULTS/"
labels = labels
for file in genre_files:
    corr_df = pd.read_csv(src_folder + file, index_col = 0)
    corr_df = corr_df.round(4)
    toxic_df = corr_df[(corr_df["Toxic" + "_p-val"] < 0.05) ]
    
    print(labels)
    for label in labels:
        topic = toxic_df[ (toxic_df[ label + "_p-val" ] > 0.05) ]["Topic"].to_list() #high p value index
        if len(topic) > 0:
            toxic_df[label][toxic_df["Topic"].isin(topic)] = "-"
    col = ["Topic"]
    col.extend(labels)
    toxic_df[col].to_csv(dest_folder + file)

['Direct Harassment', 'Hate Speech', 'Sexual Harassment', 'Trolling', 'Others', 'Toxic']
['Direct Harassment', 'Hate Speech', 'Sexual Harassment', 'Trolling', 'Others', 'Toxic']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  toxic_df[label][toxic_df["Topic"].isin(topic)] = "-"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [612]:
src_folder = {"RESULTS": "RESULTS/", "WORD": "NMF_WORD_TOPIC/"}
dest_folder = "RESULTS/"

for file in genre_files:
    toxic_df = pd.read_csv(src_folder["RESULTS"] + file, index_col = 0)
    word_df = pd.read_csv(src_folder["WORD"] + file, index_col = 0)
    toxic_df["Topic Top 10 Keywords"] = ""
    for topic in toxic_df["Topic"]:
        keywords = word_df.iloc[ topic ].nlargest(10).index.to_list()
        i = list(toxic_df[ toxic_df["Topic"] == topic].index)[0]
        toxic_df.at[i, 'Topic Top 10 Keywords'] = keywords
    toxic_df = toxic_df.reset_index(drop=True)
    toxic_df.to_csv(dest_folder+file)