In [None]:
import pandas as pd

In [2]:
# TODO:
# most common words by different categories
# lda topic distribution by different categories
# better filtering: exclude text with '(number)' or '(LETTER)' or 'as follows:'

# DONE:
# duplicate speech elemination, report its count
# some speeches are like law text submitted the following e.g. df.contents.iloc[845]
# related with above: delete extra spaces in speech (probably they occur in pure law texts)
# text token len in average and per speaker, and per party
# better filtering: exclude too long texts from via max token_count 

### Preprocessing

In [None]:
train_df = pd.read_csv('/cluster/scratch/goezsoy/nlp_lss_datasets/processed_df_train.csv')

In [3]:
df = pd.read_pickle('tokenized-records.pkl')

df = df[['bioguide_id','birthday','gender','term_type','term_start','term_end','term_state','term_party','speech']]

# preprocessing: eleminate duplicate speeches, and remove '/n'
unique_speeches_nested = df.groupby('bioguide_id')['speech'].agg(lambda x: list(set(x.values)))

unique_speeches_single_list = []
for idx,row in unique_speeches_nested.reset_index(drop=False).iterrows():
    for temp_speech in row.speech:
        temp_speech = temp_speech.replace('\n','')
        unique_speeches_single_list.append((row.bioguide_id,temp_speech))

unique_speeches_single_df = pd.DataFrame(unique_speeches_single_list,columns=['bioguide_id','speech'])

# row[0]!='.' speeches are not speech but text snippets describing the law 
processed_df = unique_speeches_single_df[unique_speeches_single_df.speech.apply(lambda row: row[0]=='.')]

# add speaker metadata to processed speeches using pd.merge
df = df.drop('speech',axis=1)
df = df.drop_duplicates(subset='bioguide_id')
processed_df= processed_df.merge(df,on='bioguide_id',how='inner')

### Analysis

In [4]:
processed_df['token_count'] = processed_df['speech'].apply(lambda row: len(row.split()))

In [5]:
# word count statistics by political parties
processed_df.groupby('term_party')['token_count'].describe(percentiles=[0.25, 0.5, 0.75, 0.95, 0.99])

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,95%,99%,max
term_party,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Democrat,110881.0,466.130924,3056.491393,2.0,37.0,167.0,477.0,1639.0,3588.0,364497.0
Independent,1610.0,664.330435,1396.209942,5.0,37.0,189.5,817.5,2573.65,4496.35,23765.0
Republican,95731.0,441.777105,2327.703812,2.0,35.0,160.0,432.0,1672.5,3687.7,373631.0


In [6]:
# word count statistics by gender
processed_df.groupby('gender')['token_count'].describe(percentiles=[0.25, 0.5, 0.75, 0.95, 0.99])

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,95%,99%,max
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
F,37039.0,465.252896,2907.392255,2.0,45.0,189.0,497.0,1506.1,3210.48,286216.0
M,171183.0,454.565564,2696.512727,2.0,35.0,158.0,447.0,1697.9,3730.0,373631.0


In [7]:
# word count statistics by Senate or House
processed_df.groupby('term_type')['token_count'].describe(percentiles=[0.25, 0.5, 0.75, 0.95, 0.99])

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,95%,99%,max
term_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
H,123861.0,380.770606,3224.614615,2.0,40.0,166.0,365.0,986.0,2953.4,373631.0
S,84361.0,567.60554,1782.751186,2.0,32.0,156.0,711.0,2236.0,4126.4,328056.0


In [8]:
# word count statistics by state sorted with descending record size
processed_df.groupby('term_state')['token_count'].describe(percentiles=[0.25, 0.5, 0.75, 0.95, 0.99]).sort_values(by='count',ascending=False)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,95%,99%,max
term_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
CA,19853.0,397.334055,2169.36407,3.0,40.0,163.0,402.0,1162.4,3565.08,186901.0
TX,13241.0,556.848501,2993.248709,4.0,40.0,178.0,498.0,2055.0,5149.0,217282.0
NY,9761.0,505.963528,5762.826539,5.0,32.0,162.0,426.0,1294.0,2671.0,328056.0
KY,8313.0,308.711175,1552.368602,3.0,26.0,69.0,303.0,1191.2,2470.68,119516.0
IL,7764.0,495.067491,1088.918112,5.0,37.0,168.0,524.0,2139.0,4021.0,38728.0
NV,7747.0,243.233639,1074.516408,4.0,28.0,65.0,229.0,985.1,1929.86,63625.0
MI,7573.0,394.971874,2591.983667,4.0,27.0,118.0,356.0,1190.2,2911.36,146282.0
FL,6559.0,345.18143,857.064557,2.0,52.0,174.0,391.0,1139.1,2722.92,41511.0
MD,6404.0,502.716271,4719.900858,2.0,43.0,194.0,533.0,1600.0,3277.97,364497.0
NJ,5981.0,471.429025,5321.731187,3.0,37.0,160.0,373.0,1357.0,3211.0,373631.0


In [9]:
# keep all until 99th percentile (removing outliers)
processed_df = processed_df[processed_df['token_count'] <= 3646.790000]
pd.to_pickle(processed_df,'processed_df.pkl')

In [16]:
processed_df.bioguide_id.value_counts()

R000146    6438
M000355    6437
D000563    3543
M000312    2151
L000174    2049
           ... 
K000180       1
C001092       1
C001089       1
C001066       1
U000038       1
Name: bioguide_id, Length: 882, dtype: int64

In [17]:
import spacy
nlp = spacy.load('en_core_web_sm')

from gensim.corpora import Dictionary

dfs = processed_df[processed_df.bioguide_id=='L000174']
dfs['cleaned_lda'] = dfs['speech'].apply(lambda row: [t.lower_ for t in nlp(row) if not t.is_punct and not t.is_stop])
dictionary = Dictionary(dfs['cleaned_lda'])
doc_term_matrix = [dictionary.doc2bow(doc) for doc in dfs['cleaned_lda']]

import pyLDAvis.gensim
pyLDAvis.enable_notebook()
from gensim.models.ldamodel import LdaModel
lda = LdaModel(doc_term_matrix, num_topics=10, id2word = dictionary, passes=3)
pyLDAvis.gensim.prepare(lda, doc_term_matrix, dictionary)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfs['cleaned_lda'] = dfs['speech'].apply(lambda row: [t.lower_ for t in nlp(row) if not t.is_punct and not t.is_stop])
  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
