In [None]:
"""
This notebook is used to compute sentiment 
distribution score of the predicted emo-denoting words 
based on their distance to the top words represented 
in each modularity class.
"""

In [1]:
import pandas as pd 
import networkx as nx
import numpy as np
from scipy.special import softmax
from collections import defaultdict
from networkx.algorithms.shortest_paths.generic import shortest_path_length as spl

In [2]:
## create undirected network
"""
The following are needed for this notebook:
1) Path to edge_list
2) Path to node_list 
3) Exported gephi output csv file
4) Non-aggregated detected emotion words - pref_with_rfc.csv (from Emo-denoting Prediction without aggregated + corpus)

"""
#Instantiate new Graph object
Graphtype = nx.Graph()

#Path for all input files
edge_list_path = "../source_data/second_iteration_edge_list_updated.csv"
node_list_path = "../source_data/second_iteration_nodes_updated.csv"
gephi_output_csv_path = "../source_data/gephi_output_cleaned.csv"
output_folder="../output/"
emo_pred_file_path = "pred_with_rfc.csv"


## select the top k words from each modularity_class to represent the class
top_k_words = 10

### build the network
edge_list = pd.read_csv(edge_list_path)
node_list = pd.read_csv(node_list_path)
syn_G = nx.from_pandas_edgelist(edge_list, edge_attr='weight', create_using=Graphtype)

In [3]:
## read gephi output graph for computed distance, centrality, authority values
network = pd.read_csv(gephi_output_csv_path)
network.reset_index()
network.set_index(network['Id'],inplace=True)
mapping =network['Label'].to_dict()
G = nx.relabel_nodes(syn_G, mapping, copy=False)

In [6]:
## Sorts network by Authority scores and groups by modularity class
selected_top = network.sort_values('Authority',ascending=False).groupby('modularity_class').head(top_k_words)
## generate dic with cluster number as key and representing words list as the value
selected_top_dic = selected_top.groupby('modularity_class')['Label'].agg(list).to_dict()
### import the emo_pred_output from Emo-denoting prediction notebook
pred = pd.read_csv('pred_with_rfc.csv')

In [11]:

def emo_distribution_cal(top_cluster_word_dict = selected_top_dic,source=None,network_Graph=G,row=None):
    """
    Parameters
    ----------
    top_cluster_word_dict: dictionary,
        Dictionary of the top k words for each modularity class
    
    source: str,
        The source word - default is None
        
    network_Graph: networkX Graph,
        NetworkX Graph object
    
    row: 
        
    output softmax function value for the probability of the emotion based on the score 
    """
    emo_score_dist = []
    if row['emo?']:
        source=row['word'].split('_')[0]
    else:
        return 
    for key, top_words in top_cluster_word_dict.items():
        total_distance = 0
        count =0
        ### loop for each top word in one cluster
        for top_word in top_words:
            try: 
                """
                spl retrieves the shortest distance from the source to the top_word in the network
                """
                total_distance += spl(network_Graph,source,top_word)
                count+=1
            except:
                """
                when either source or top_word in the one cluster not found in the network 
                or they are not reachable => just pass 
                """
                pass
        ### shorter distance between nodes => closer the relationship => 1/ avg_distance of the cluster
        if count !=0:
            avg_distance = total_distance/count
            emo_score_dist.append(1/avg_distance) 
        else:
            emo_score_dist.append(0)
        
    return softmax(emo_score_dist)


In [13]:
### emotion score distribution of all the clusters
total_emo_res = pred.apply(lambda x: emo_distribution_cal(row=x),axis=1)
pred["emo_dist_prob"] = total_emo_res

def emo_cluster_assignment(row):
    if row['emo?']==1:
        return np.array(row['emo_dist_prob']).argsort()[::-1]
    else:
        return 0

pred['emo_dist_cluster_order']=pred.apply(lambda x: emo_cluster_assignment(x),axis=1)


In [9]:
### output results
pred.to_csv('emo_assignment.csv',index=False)

In [11]:
emo_df = pd.read_csv('emo_assignment.csv')

In [73]:
emo_subset = emo_df[['emo?', 'word', 'emo_dist_prob', 'emo_dist_cluster_order']]

In [59]:
main_df = pd.read_csv('res.csv')
main_df.head()

Unnamed: 0,index,prediction,cleaned_index,Label,self_auth,self_class,self_deg,self_betcent,pred_betcent,pred_auth,pred_deg,pred_class,string,from_textid
0,consistently_2,"['also', 'being', 'actually', 'currently', 'st...",consistently,consistently,0.000209,25.0,10.0,0.0,"[0.0, 0.0, 0.0, -1, 155965.9877]","[0.000391, 0.000651, 0.000391, -1, 0.000269]","[33, 125, 7, -1, 136]","[9, 3, 21, -1, 16]",Saxobank is consistently rated very poorly fro...,0
1,rated_3,"['performing', 'doing', 'functioning', 'workin...",rated,,,,,,"[-1, 174890.4158, -1, 481.203593, 47866.10398]","[-1, 0.001086, -1, 0.001485, 0.000102]","[-1, 169, -1, 19, 9]","[-1, 25, -1, 8, 9]",Saxobank is consistently rated very poorly fro...,0
2,poorly_5,"['highly', 'high', 'low', 'well', 'poorly']",poorly,poorly,0.001552,11.0,10.0,194.873669,"[1816.335264, 0.0, 39966.12492, 0.0, 194.873669]","[0.015114, 0.00691, 0.005939, 0.0034, 0.001552]","[176, 198, 173, 135, 10]","[25, 23, 11, 5, 11]",Saxobank is consistently rated very poorly fro...,0
3,seen_10,"['written', 'done', 'read', 'said']",seen,,,,,,"[-1, 0.0, 35.951423, -1]","[-1, 0.002634, 1.1e-05, -1]","[-1, 18, 8, -1]","[-1, 17, 0, -1]",Saxobank is consistently rated very poorly fro...,0
4,contradicting_17,"['but', 'and']",contradicting,,,,,,"[6263.708295, 10269.73499]","[8.6e-05, 0.000116]","[26, 10]","[18, 8]",Saxobank is consistently rated very poorly fro...,0


In [74]:
main_df.shape, emo_subset.shape

((267092, 14), (77026, 4))

In [75]:
combined_df = main_df.merge(emo_subset, how='left', left_on='index', right_on='word').drop_duplicates()

In [76]:
combined_df.shape

(295781, 18)

In [82]:
combined_df['emo_dist_cluster_order'] = combined_df['emo_dist_cluster_order'].fillna(-1)
combined_df['emo_dist_prob'] = combined_df['emo_dist_prob'].fillna(-1)

In [86]:
combined_df['emo_dist_cluster_order'] = combined_df['emo_dist_cluster_order'].replace('0', -1)

In [94]:
cleaned_df = combined_df.dropna()

In [134]:
cleaned_df.head()

Unnamed: 0,index,prediction,cleaned_index,Label,self_auth,self_class,self_deg,self_betcent,pred_betcent,pred_auth,pred_deg,pred_class,string,from_textid,emo?,word,emo_dist_prob,emo_dist_cluster_order
0,consistently_2,"['also', 'being', 'actually', 'currently', 'st...",consistently,consistently,0.000209,25.0,10.0,0.0,"[0.0, 0.0, 0.0, -1, 155965.9877]","[0.000391, 0.000651, 0.000391, -1, 0.000269]","[33, 125, 7, -1, 136]","[9, 3, 21, -1, 16]",Saxobank is consistently rated very poorly fro...,0,0.0,consistently_2,-1,-1
5,poorly_5,"['highly', 'high', 'low', 'well', 'poorly']",poorly,poorly,0.001552,11.0,10.0,194.873669,"[1816.335264, 0.0, 39966.12492, 0.0, 194.873669]","[0.015114, 0.00691, 0.005939, 0.0034, 0.001552]","[176, 198, 173, 135, 10]","[25, 23, 11, 5, 11]",Saxobank is consistently rated very poorly fro...,0,1.0,poorly_5,[0.03844286 0.03785371 0.03911434 0.03844286 0...,[20 9 11 2 22 7 13 19 16 25 3 5 8 0 17 ...
10,point_19,"['pointing', 'pointed', 'point', 'points', 'fi...",point,point,0.002054,24.0,242.0,0.0,"[-1, -1, 0.0, -1, -1]","[-1, -1, 0.002054, -1, -1]","[-1, -1, 242, -1, -1]","[-1, -1, 24, -1, -1]",Saxobank is consistently rated very poorly fro...,0,0.0,point_19,-1,-1
26,sense_28,"['good', 'quality', 'general', 'new', 'strict']",sense,sense,0.000308,4.0,116.0,0.0,"[0.0, 0.0, 10972.03876, 2012.54884, 287.247587]","[0.013005, 0.101255, 0.000135, 0.001431, 0.000...","[398, 229, 13, 59, 43]","[18, 17, 5, 10, 12]",Saxobank is consistently rated very poorly fro...,0,0.0,sense_28,-1,-1
46,entire_7,"['new', 'old', 'entire', 'former', 'current']",entire,entire,0.002084,5.0,84.0,0.0,"[2012.54884, 0.0, 0.0, 0.0, 3153.12649]","[0.001431, 0.000349, 0.002084, 0.000104, 0.001...","[59, 66, 84, 22, 61]","[10, 10, 5, 15, 8]",I wonder what would happen if the entire WSB a...,4,0.0,entire_7,-1,-1


In [118]:
cleaned_df['emo_dist_cluster_order'] = cleaned_df['emo_dist_cluster_order'].to_list()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['emo_dist_cluster_order'] = cleaned_df['emo_dist_cluster_order'].to_list()


In [119]:
cleaned_df['emo_dist_cluster_order'].dtype

dtype('O')

In [127]:
testing = cleaned_df.iloc[1][17]

In [154]:
def list_mapping(string):
    res = list(map(float, string[1:-1].split()))
    return res

In [144]:
cleaned_df['emo_dist_cluster_order'] = cleaned_df['emo_dist_cluster_order'].apply(lambda x: list_mapping(x) if x != -1 else -1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['emo_dist_cluster_order'] = cleaned_df['emo_dist_cluster_order'].apply(lambda x: list_mapping(x) if x != -1 else -1)


In [155]:
cleaned_df['emo_dist_prob'] = cleaned_df['emo_dist_prob'].apply(lambda x: list_mapping(x) if x != -1 else -1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['emo_dist_prob'] = cleaned_df['emo_dist_prob'].apply(lambda x: list_mapping(x) if x != -1 else -1)


In [156]:
cleaned_df['emo_dist_prob'] = cleaned_df['emo_dist_prob'].to_list()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['emo_dist_prob'] = cleaned_df['emo_dist_prob'].to_list()


In [146]:
cleaned_df['emo_dist_cluster_order'] = cleaned_df['emo_dist_cluster_order'].to_list()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['emo_dist_cluster_order'] = cleaned_df['emo_dist_cluster_order'].to_list()


In [157]:
cleaned_df.head()

Unnamed: 0,index,prediction,cleaned_index,Label,self_auth,self_class,self_deg,self_betcent,pred_betcent,pred_auth,pred_deg,pred_class,string,from_textid,emo?,word,emo_dist_prob,emo_dist_cluster_order
0,consistently_2,"['also', 'being', 'actually', 'currently', 'st...",consistently,consistently,0.000209,25.0,10.0,0.0,"[0.0, 0.0, 0.0, -1, 155965.9877]","[0.000391, 0.000651, 0.000391, -1, 0.000269]","[33, 125, 7, -1, 136]","[9, 3, 21, -1, 16]",Saxobank is consistently rated very poorly fro...,0,0.0,consistently_2,-1,-1
5,poorly_5,"['highly', 'high', 'low', 'well', 'poorly']",poorly,poorly,0.001552,11.0,10.0,194.873669,"[1816.335264, 0.0, 39966.12492, 0.0, 194.873669]","[0.015114, 0.00691, 0.005939, 0.0034, 0.001552]","[176, 198, 173, 135, 10]","[25, 23, 11, 5, 11]",Saxobank is consistently rated very poorly fro...,0,1.0,poorly_5,"[0.03844286, 0.03785371, 0.03911434, 0.0384428...","[20, 9, 11, 2, 22, 7, 13, 19, 16, 25, 3, 5, 8,..."
10,point_19,"['pointing', 'pointed', 'point', 'points', 'fi...",point,point,0.002054,24.0,242.0,0.0,"[-1, -1, 0.0, -1, -1]","[-1, -1, 0.002054, -1, -1]","[-1, -1, 242, -1, -1]","[-1, -1, 24, -1, -1]",Saxobank is consistently rated very poorly fro...,0,0.0,point_19,-1,-1
26,sense_28,"['good', 'quality', 'general', 'new', 'strict']",sense,sense,0.000308,4.0,116.0,0.0,"[0.0, 0.0, 10972.03876, 2012.54884, 287.247587]","[0.013005, 0.101255, 0.000135, 0.001431, 0.000...","[398, 229, 13, 59, 43]","[18, 17, 5, 10, 12]",Saxobank is consistently rated very poorly fro...,0,0.0,sense_28,-1,-1
46,entire_7,"['new', 'old', 'entire', 'former', 'current']",entire,entire,0.002084,5.0,84.0,0.0,"[2012.54884, 0.0, 0.0, 0.0, 3153.12649]","[0.001431, 0.000349, 0.002084, 0.000104, 0.001...","[59, 66, 84, 22, 61]","[10, 10, 5, 15, 8]",I wonder what would happen if the entire WSB a...,4,0.0,entire_7,-1,-1


In [161]:
exploded_df = cleaned_df.apply(pd.Series.explode)

In [162]:
exploded_df.to_csv('emo_exploded_df.csv', index=False)

In [163]:
exploded_df.head(25)

Unnamed: 0,index,prediction,cleaned_index,Label,self_auth,self_class,self_deg,self_betcent,pred_betcent,pred_auth,pred_deg,pred_class,string,from_textid,emo?,word,emo_dist_prob,emo_dist_cluster_order
0,consistently_2,"['also', 'being', 'actually', 'currently', 'st...",consistently,consistently,0.000209,25.0,10.0,0.0,"[0.0, 0.0, 0.0, -1, 155965.9877]","[0.000391, 0.000651, 0.000391, -1, 0.000269]","[33, 125, 7, -1, 136]","[9, 3, 21, -1, 16]",Saxobank is consistently rated very poorly fro...,0,0.0,consistently_2,-1.0,-1
5,poorly_5,"['highly', 'high', 'low', 'well', 'poorly']",poorly,poorly,0.001552,11.0,10.0,194.873669,"[1816.335264, 0.0, 39966.12492, 0.0, 194.873669]","[0.015114, 0.00691, 0.005939, 0.0034, 0.001552]","[176, 198, 173, 135, 10]","[25, 23, 11, 5, 11]",Saxobank is consistently rated very poorly fro...,0,1.0,poorly_5,0.0384429,20
5,poorly_5,"['highly', 'high', 'low', 'well', 'poorly']",poorly,poorly,0.001552,11.0,10.0,194.873669,"[1816.335264, 0.0, 39966.12492, 0.0, 194.873669]","[0.015114, 0.00691, 0.005939, 0.0034, 0.001552]","[176, 198, 173, 135, 10]","[25, 23, 11, 5, 11]",Saxobank is consistently rated very poorly fro...,0,1.0,poorly_5,0.0378537,9
5,poorly_5,"['highly', 'high', 'low', 'well', 'poorly']",poorly,poorly,0.001552,11.0,10.0,194.873669,"[1816.335264, 0.0, 39966.12492, 0.0, 194.873669]","[0.015114, 0.00691, 0.005939, 0.0034, 0.001552]","[176, 198, 173, 135, 10]","[25, 23, 11, 5, 11]",Saxobank is consistently rated very poorly fro...,0,1.0,poorly_5,0.0391143,11
5,poorly_5,"['highly', 'high', 'low', 'well', 'poorly']",poorly,poorly,0.001552,11.0,10.0,194.873669,"[1816.335264, 0.0, 39966.12492, 0.0, 194.873669]","[0.015114, 0.00691, 0.005939, 0.0034, 0.001552]","[176, 198, 173, 135, 10]","[25, 23, 11, 5, 11]",Saxobank is consistently rated very poorly fro...,0,1.0,poorly_5,0.0384429,2
5,poorly_5,"['highly', 'high', 'low', 'well', 'poorly']",poorly,poorly,0.001552,11.0,10.0,194.873669,"[1816.335264, 0.0, 39966.12492, 0.0, 194.873669]","[0.015114, 0.00691, 0.005939, 0.0034, 0.001552]","[176, 198, 173, 135, 10]","[25, 23, 11, 5, 11]",Saxobank is consistently rated very poorly fro...,0,1.0,poorly_5,0.038139,22
5,poorly_5,"['highly', 'high', 'low', 'well', 'poorly']",poorly,poorly,0.001552,11.0,10.0,194.873669,"[1816.335264, 0.0, 39966.12492, 0.0, 194.873669]","[0.015114, 0.00691, 0.005939, 0.0034, 0.001552]","[176, 198, 173, 135, 10]","[25, 23, 11, 5, 11]",Saxobank is consistently rated very poorly fro...,0,1.0,poorly_5,0.0384429,7
5,poorly_5,"['highly', 'high', 'low', 'well', 'poorly']",poorly,poorly,0.001552,11.0,10.0,194.873669,"[1816.335264, 0.0, 39966.12492, 0.0, 194.873669]","[0.015114, 0.00691, 0.005939, 0.0034, 0.001552]","[176, 198, 173, 135, 10]","[25, 23, 11, 5, 11]",Saxobank is consistently rated very poorly fro...,0,1.0,poorly_5,0.0378537,13
5,poorly_5,"['highly', 'high', 'low', 'well', 'poorly']",poorly,poorly,0.001552,11.0,10.0,194.873669,"[1816.335264, 0.0, 39966.12492, 0.0, 194.873669]","[0.015114, 0.00691, 0.005939, 0.0034, 0.001552]","[176, 198, 173, 135, 10]","[25, 23, 11, 5, 11]",Saxobank is consistently rated very poorly fro...,0,1.0,poorly_5,0.0391143,19
5,poorly_5,"['highly', 'high', 'low', 'well', 'poorly']",poorly,poorly,0.001552,11.0,10.0,194.873669,"[1816.335264, 0.0, 39966.12492, 0.0, 194.873669]","[0.015114, 0.00691, 0.005939, 0.0034, 0.001552]","[176, 198, 173, 135, 10]","[25, 23, 11, 5, 11]",Saxobank is consistently rated very poorly fro...,0,1.0,poorly_5,0.0384429,16


In [64]:
#final_df = combined_df[~combined_df['emo_dist_prob'].isnull()]

In [71]:
#print(final_df.shape)
#final_df.head(10)

#emo_dist_prob - explode two columns and export to bigquery - convert arrays to list and explode them
#naming conventions, import exports, SQL, tableau
#reddit tags (no emotions = -1)



(36390, 17)


Unnamed: 0,index,prediction,cleaned_index,Label,self_auth,self_class,self_deg,self_betcent,pred_betcent,pred_auth,pred_deg,pred_class,string,from_textid,word,emo_dist_prob,emo_dist_cluster_order
5,poorly_5,"['highly', 'high', 'low', 'well', 'poorly']",poorly,poorly,0.001552,11.0,10.0,194.873669,"[1816.335264, 0.0, 39966.12492, 0.0, 194.873669]","[0.015114, 0.00691, 0.005939, 0.0034, 0.001552]","[176, 198, 173, 135, 10]","[25, 23, 11, 5, 11]",Saxobank is consistently rated very poorly fro...,0,poorly_5,[0.03844286 0.03785371 0.03911434 0.03844286 0...,[20 9 11 2 22 7 13 19 16 25 3 5 8 0 17 ...
50,entire_7,"['new', 'old', 'entire', 'former', 'current']",entire,entire,0.002084,5.0,84.0,0.0,"[2012.54884, 0.0, 0.0, 0.0, 3153.12649]","[0.001431, 0.000349, 0.002084, 0.000104, 0.001...","[59, 66, 84, 22, 61]","[10, 10, 5, 15, 8]",I wonder what would happen if the entire WSB a...,4,entire_7,[0.03785458 0.03701028 0.03888225 0.03851578 0...,[19 17 11 5 8 14 2 16 13 25 3 4 6 10 12 ...
65,mean_2,"['to', 'can', 'ets', 'that']",mean,mean,0.102042,17.0,423.0,97091.2793,"[104792.6541, 0.0, -1, 44742.32375]","[0.000248, 0.000348, -1, 0.01506]","[31, 144, -1, 155]","[19, 4, -1, 25]",Diamond hands mean sell gme calls earlier lol,7,mean_2,[0.0377439 0.03859299 0.03907121 0.03907121 0...,[19 11 20 10 13 2 3 25 1 9 24 12 17 18 22 ...
104,Amazing_0,"['p', 'vol', 'math', 'soc', 'trans']",amazing,amazing,0.005175,17.0,11.0,0.0,"[-1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1]",Amazing,8,Amazing_0,[0.03846154 0.03846154 0.03846154 0.03846154 0...,[25 24 1 2 3 4 5 6 7 8 9 10 11 12 13 ...
114,safe_28,"['for', 'with', 'and', 'on', 'of']",safe,safe,0.000259,4.0,14.0,70.224625,"[0.0, 11410.48432, 10269.73499, 0.0, 0.0]","[0.000251, 0.000332, 0.000116, 0.002862, 4.7e-05]","[58, 48, 10, 199, 6]","[14, 9, 8, 9, 9]",#***X5*** The above user [samthesuperman](http...,9,safe_28,[0.0386386 0.03758008 0.03840372 0.03840372 0...,[ 6 8 17 10 11 19 9 14 0 4 13 2 3 25 15 ...
126,Man_0,"['and', 'so', 'maybe', 'but']",man,man,0.000393,3.0,100.0,0.0,"[10269.73499, 2087.67817, 0.0, 6263.708295]","[0.000116, 0.015486, 9e-06, 8.6e-05]","[10, 222, 19, 26]","[8, 25, 21, 18]","Man I’m right there with ya, but April calls i...",11,Man_0,[0.03846154 0.03846154 0.03846154 0.03846154 0...,[25 24 1 2 3 4 5 6 7 8 9 10 11 12 13 ...
186,right_3,"['there', 'here', 'along', 'back', 'up']",right,right,0.017794,25.0,194.0,9.157982,"[-1, -1, 21246.36983, 0.0, 0.0]","[-1, -1, 0.001353, 0.000403, 0.016058]","[-1, -1, 39, 76, 634]","[-1, -1, 9, 4, 9]","Man I’m right there with ya, but April calls i...",11,right_3,[0.03765279 0.03765279 0.03790772 0.03790772 0...,[ 8 18 17 25 5 13 11 14 6 19 23 10 4 7 9 ...
228,Good_13,"['thanks', 'one', 'hello', 'love', 'luck']",good,good,0.013005,18.0,398.0,0.0,"[-1, 0.0, -1, 57973.51769, -1]","[-1, 0.000232, -1, 0.000218, -1]","[-1, 24, -1, 84, -1]","[-1, 10, -1, 18, -1]","Man I’m right there with ya, but April calls i...",11,Good_13,[0.03846154 0.03846154 0.03846154 0.03846154 0...,[25 24 1 2 3 4 5 6 7 8 9 10 11 12 13 ...
243,low_15,"['price', 'cost', 'rate', 'wage', 'rent']",low,low,0.005939,11.0,173.0,39966.12492,"[-1, 0.0, 0.0, -1, 0.0]","[-1, 4.5e-05, 0.001493, -1, 6.8e-05]","[-1, 7, 77, -1, 12]","[-1, 4, 6, -1, 6]",Right. So you’re saying you expect a major exi...,14,low_15,[0.03806923 0.03806923 0.03771042 0.03845493 0...,[11 20 8 23 16 4 13 22 19 17 18 3 5 25 1 ...
273,obviously_1,"['just', 'probably', 'obviously', 'certainly',...",obviously,obviously,1.6e-05,21.0,42.0,0.0,"[0.0, 163.77019, 0.0, 11953.84237, 149430.1137]","[0.003158, 7e-05, 1.6e-05, 0.000521, 0.01523]","[168, 17, 42, 44, 204]","[18, 21, 21, 21, 25]",You obviously didn't read the DD... Short the ...,15,obviously_1,[0.03759933 0.03759933 0.03856089 0.03932219 0...,[13 10 17 11 3 14 25 22 19 8 23 2 4 9 12 ...


In [70]:
final_df['from_textid'].unique()

array([    0,     4,     7, ..., 48779, 48780, 48781])