# Computational Social Science: Exploratory Data Analysis and Unsupervised Methods

This lab will demonstrate some exploratory methods for finding separating words, and introduce unsupervised topic models.

In [None]:
# Uncomment to install the optional packages; %pip (rather than !pip) installs
# into the environment of the running kernel.
#%pip install scattertext
#%pip install wordcloud

In [1]:
import pandas as pd
import numpy as np
from PIL import Image
import spacy
import en_core_web_sm
import scattertext as st
nlp = en_core_web_sm.load()
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Data

<img src = "../../images/cfpb logo.png"  />

We'll once again use the Consumer Financial Protection Bureau's [Consumer Complaint Database](https://www.consumerfinance.gov/data-research/consumer-complaints/). This time, we are going to focus on figuring out whether we can find text features that help distinguish different "Products." There are several products represented in the dataset:

In [2]:
# Load the 2020 complaints and keep only the rows that actually carry a
# narrative, since every later step works on that text column.
cfpb = (
    pd.read_csv("../../data/CFPB 2020 Complaints.csv")
    .dropna(subset=['Consumer complaint narrative'])
)
cfpb['Product'].unique()

array(['Credit reporting, credit repair services, or other personal consumer reports',
       'Debt collection', 'Checking or savings account',
       'Money transfer, virtual currency, or money service',
       'Payday loan, title loan, or personal loan',
       'Credit card or prepaid card', 'Mortgage', 'Vehicle loan or lease',
       'Student loan'], dtype=object)

For the first few exercises, we will focus on mortgages and student loans. We will also just use the first one thousand observations so that the code runs faster. 

In [3]:
# Keep only the two products compared in this section, then take the first
# 1,000 rows so the spaCy preprocessing below runs quickly.
# .copy() yields an independent frame: later cells add columns to `cfpb`, and
# assigning into a slice of another frame triggers SettingWithCopyWarning.
cfpb = cfpb[cfpb['Product'].isin(['Mortgage', 'Student loan'])].iloc[:1000].copy()

In [4]:
# Preview the first rows of the filtered frame.
cfpb.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
213,01/20/20,Mortgage,Conventional home mortgage,Struggling to pay mortgage,,"I have been contacting OCWEN MORTGAGE, PHH MOR...",Company believes it acted appropriately as aut...,Ocwen Financial Corporation,LA,,Servicemember,Consent provided,Web,02/18/20,Closed with explanation,Yes,,3502248
216,01/15/20,Mortgage,Conventional home mortgage,Struggling to pay mortgage,,We purchased our house in XX/XX/XXXX the Assig...,,Ocwen Financial Corporation,ID,,,Consent provided,Web,02/10/20,Closed with explanation,Yes,,3497147
283,03/28/20,Mortgage,VA mortgage,Closing on a mortgage,,I had past issues with Mortgage Solutions of C...,,MORTGAGE SOLUTIONS OF COLORADO,TX,,Servicemember,Consent provided,Web,04/01/20,Closed with explanation,Yes,,3583812
293,03/31/20,Mortgage,VA mortgage,Trouble during payment process,,I have called Nationstar multiple times to req...,Company believes it acted appropriately as aut...,NATIONSTAR MORTGAGE,MN,551XX,Servicemember,Consent provided,Web,03/31/20,Closed with explanation,Yes,,3586865
387,03/31/20,Mortgage,VA mortgage,Struggling to pay mortgage,,All documents for the short sale review were s...,Company believes it acted appropriately as aut...,"Shellpoint Partners, LLC",VA,,,Consent provided,Web,03/31/20,Closed with explanation,Yes,,3588118


## Preprocessing

Let's start by creating our tokens. We'll use the same `rem_punc_stop()` function we defined last time.

In [5]:
def rem_punc_stop(text):
    """Tokenize ``text`` with spaCy after removing punctuation and stop words.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Token texts in document order, with stop words (matched
        case-insensitively) and whitespace-only tokens removed.
    """
    punc = set(punctuation)

    # Strip punctuation character by character before tokenizing.
    punc_free = "".join(ch for ch in text if ch not in punc)

    doc = nlp(punc_free)

    # spaCy's English STOP_WORDS entries are lowercase, so compare against the
    # lowercased token; otherwise capitalized stop words ("The", "I") survive.
    # Removing punctuation can leave runs of spaces, which the tokenizer emits
    # as whitespace tokens -- those carry no content, so skip them as well.
    return [
        token.text
        for token in doc
        if not token.is_space and token.text.lower() not in STOP_WORDS
    ]

Notice here how we use the `map()` function to apply our `rem_punc_stop()` function to every row of our dataframe. `map()` is typically much faster than writing a for loop, though there are also faster options like [list comprehensions](https://docs.python.org/3/tutorial/datastructures.html) and vectorized numpy arrays.

In [None]:
# Tokenize every narrative; the function is passed to map() directly, so no
# wrapper lambda is needed.
narratives = cfpb['Consumer complaint narrative']
cfpb['tokens'] = narratives.map(rem_punc_stop)
cfpb['tokens']

## Word Cloud

In [None]:
# Join each complaint's token list back into a single string and confirm the
# element type. Use positional access (.iloc[0]) instead of a hard-coded index
# label, which only exists for one particular row ordering of the data.
text = cfpb['tokens'].map(' '.join)
type(text.iloc[0])

In [None]:
# Flatten all token lists into one corpus-wide string and draw the default
# (black background) word cloud.
text = ' '.join(' '.join(tokens) for tokens in cfpb['tokens'])
wordcloud = WordCloud().generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Same corpus string as before, rendered on a white background.
per_complaint_text = cfpb['tokens'].map(' '.join)
text = ' '.join(per_complaint_text)
wordcloud = WordCloud(background_color="white").generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Draw the word cloud inside the shape of the CFPB logo.
cfpb_mask = np.array(Image.open("../../images/cfpb logo.png"))
text = ' '.join(cfpb['tokens'].map(' '.join))
# No width/height here: WordCloud ignores both when a mask is supplied and
# sizes the canvas from the mask array instead.
wordcloud = WordCloud(background_color="white", mask=cfpb_mask).generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

## Lengths and Counts

In [None]:
# Character count and whitespace-delimited word count of each raw narrative,
# computed with the vectorized .str accessor.
narrative_text = cfpb['Consumer complaint narrative'].astype(str)
cfpb['complaint_len'] = narrative_text.str.len()
cfpb['word_count'] = narrative_text.str.split().str.len()

In [None]:
# Histogram of narrative length in characters.
sns.displot(data=cfpb, x="complaint_len")
plt.show()

In [None]:
# Histogram of narrative length in words.
sns.displot(data=cfpb, x="word_count")
plt.show()

In [None]:
# Word-count histograms, one panel (and one color) per product.
sns.displot(
    data=cfpb,
    x="word_count",
    col="Product",
    hue="Product",
)
plt.show()

## Sentiment Polarity

In [None]:
# Collapse each token list into one space-separated string, then score it with
# TextBlob. The isinstance guard keeps this cell idempotent: on a re-run
# `tokens` already holds strings, and ' '.join(str) would put a space between
# every character and silently corrupt the text.
cfpb['tokens'] = cfpb['tokens'].map(
    lambda toks: toks if isinstance(toks, str) else ' '.join(toks)
)
cfpb['polarity'] = cfpb['tokens'].map(
    lambda doc_text: TextBlob(doc_text).sentiment.polarity
)
cfpb.head()

In [None]:
# Distribution of TextBlob polarity scores across all complaints.
sns.displot(data=cfpb, x="polarity")
plt.show()

In [None]:
# Show the five narratives with the highest polarity score.
most_positive = cfpb.nlargest(5, 'polarity')
for narrative in most_positive['Consumer complaint narrative']:
    print(narrative + "\n")

In [None]:
# Show the five narratives with the lowest polarity score.
most_negative = cfpb.nsmallest(5, 'polarity')
for narrative in most_negative['Consumer complaint narrative']:
    print(narrative + '\n')

In [None]:
# Polarity histograms, one panel (and one color) per product.
sns.displot(
    data=cfpb,
    x="polarity",
    col="Product",
    hue="Product",
)
plt.show()

## ScatterText

In [None]:
# Build a ScatterText corpus from the first 500 complaints, using the joined
# token strings as text and the product as the category label.
corpus = st.CorpusFromPandas(
    cfpb.head(500),
    category_col='Product',
    text_col='tokens',
    nlp=nlp,
).build()

In [None]:
# Render the interactive explorer contrasting student-loan complaints with
# everything else in the corpus. No per-document metadata is passed.
html = st.produce_scattertext_explorer(
    corpus,
    category='Student loan',
    category_name='Student',
    not_category_name='Not Student',
    minimum_term_frequency=5,
    transform=st.Scalers.scale,
    width_in_pixels=1000,
)

In [None]:
# Write the explorer page as UTF-8 bytes. The context manager closes the file
# handle deterministically, even if the write raises; the bare
# open(...).write(...) never closed it explicitly.
with open("CFPB Sentiment.html", 'wb') as html_file:
    html_file.write(html.encode('utf-8'))

## Unsupervised Methods

In [None]:
# Start over from the raw file for the topic-model section: narratives only,
# restricted to checking/savings and student-loan complaints, first 1,000 rows.
cfpb = (
    pd.read_csv("../../data/CFPB 2020 Complaints.csv")
    .dropna(subset=['Consumer complaint narrative'])
)
topic_products = ['Checking or savings account', 'Student loan']
cfpb = cfpb[cfpb['Product'].isin(topic_products)].head(1000)

In [None]:
# X
# Vectorize the raw narratives with TF-IDF, reusing rem_punc_stop as the
# tokenizer so the vocabulary matches the earlier preprocessing.
tf = TfidfVectorizer(tokenizer=rem_punc_stop)
X = cfpb['Consumer complaint narrative']
tfidf_matrix = tf.fit_transform(X)

In [None]:
# .toarray() returns a plain ndarray. .todense() returns np.matrix, which
# scikit-learn deprecated as estimator input in 1.0 and rejects with a
# TypeError from 1.2 onward.
dense_matrix = tfidf_matrix.toarray()

In [None]:
# Fit a 5-topic LDA model; random_state pins the initialization so the topics
# are reproducible. fit() returns the estimator itself.
lda = LatentDirichletAllocation(
    n_components=5,
    max_iter=20,
    random_state=0,
).fit(dense_matrix)

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model
        Object exposing ``components_``, an iterable of per-topic weight
        arrays indexed by feature position.
    feature_names
        Sequence mapping a feature position to its term.
    n_top_words : int
        Number of terms to print per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Positions of the n_top_words largest weights, heaviest first.
        top_positions = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[pos] for pos in top_positions]
        print("\nTopic #{}:".format(topic_idx))
        print(" ".join(top_terms))
    print()

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(tf, "get_feature_names_out"):
    tf_feature_names = tf.get_feature_names_out()
else:
    tf_feature_names = tf.get_feature_names()
print_top_words(lda, tf_feature_names, 20)

## Topic weights

One thing we may want to do with the output is compare the prevalence of each topic across documents. A simple (but not memory-efficient) way to do this is to merge the topic distribution back into the pandas DataFrame.

First get the topic distribution array.

In [None]:
# Per-document topic weights from the fitted LDA model (the sparse TF-IDF
# matrix is passed directly).
topic_dist = lda.transform(tfidf_matrix)
topic_dist

Then merge it back with the original DataFrame.

In [None]:
# The topic-weight columns get integer labels (0, 1, ...). Resetting cfpb to a
# fresh positional index lines its rows up with those weights for the join.
topic_dist_df = pd.DataFrame(topic_dist)
complaints_reindexed = cfpb.reset_index()
df_w_topics = topic_dist_df.join(complaints_reindexed)
df_w_topics

Now we can check the average weight of each topic across products using `groupby`.

In [None]:
# Mean weight of topic 2 (integer column label) per product, largest first.
grouped = df_w_topics.groupby('Product')
topic_2_mean = grouped[2].mean()
topic_2_mean.sort_values(ascending=False)

In [None]:
# Per-product density of topic 2's weight. `fill` is a boolean flag: the string
# 'true' only worked because any non-empty string is truthy (so 'false' would
# have filled the curves too).
sns.displot(df_w_topics, x=df_w_topics[2], hue="Product", kind='kde', fill=True)
plt.show()