***
*Homework 3: Knoll et al. (2015) + STM*
***
**Code Author:** Lan Luo  
**Course:** Probabilistic Models and Machine Learning (Fall 2022)  
**Professor:** David Blei
<br>  
This code processes data and cleans text related to thoughts that come to mind for retirement benefit claiming decisions.

# Initialize Environment

## Set File Path

In [4]:
from datetime import date

# root folder that every other location is built from
base_path = 'G:/My Drive/Columbia Files'
# homework folder
path = f'{base_path}/Coursework/2022-2023 Fall/Graphical Models/Homework 3'
# folder holding the personal helper modules
helper_path = f'{base_path}/Research/- helper'

# today's date (note: a datetime.date object, not a string, despite the name)
str_date = date.today()

## Packages

In [5]:
# basic essentials
import numpy as np
import pandas as pd
from itertools import chain
from collections import Counter
import ujson as json
import random, sys, gzip, pickle, os, gc, itertools, time, importlib, re, scipy

# data visualization
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import colors
sys.path.append(helper_path)
from lan_vis import *
importlib.reload(sys.modules['lan_vis'])
%matplotlib inline
vis_style('light')

# text processing
# reload first, then star-import, so edits to the helper module are picked up
# on a single run (reload() does not rebind names already star-imported)
import lan_text
importlib.reload(lan_text)
from lan_text import *
from gensim import corpora
from gensim.models import LdaMulticore, LdaModel
from gensim.test.utils import datapath
from sklearn.model_selection import train_test_split

# computer vision
# reload first, then star-import, so edits to the helper module are picked up
# on a single run (reload() does not rebind names already star-imported)
import lan_cv
importlib.reload(lan_cv)
from lan_cv import *

<module 'lan_cv' from 'G:/My Drive/Columbia Files/Research/- helper\\lan_cv.py'>

# Data

In [6]:
# read the raw study file from the homework folder
raw_file = f"{path}/1 SSA dataset; Study3; Nov 2021.csv"
data = pd.read_csv(raw_file)
# discard rows that have no value in the 'study' column
data = data.dropna(subset=['study'])

In [7]:
# every column whose name starts with 'aspect_text'
# NOTE: this list keeps the names as they are BEFORE the lower-casing below;
# it only matches the renamed frame if those names are already lower case
aspects = [col for col in data.columns if col.startswith('aspect_text')]

# columns carried forward, followed by the aspect text columns
keep_cols = [
    'S3_NATURAL_ORDER', 'CLAIM_AGE', 'PID', 'text_integrated',
    'AGE', 'FEMALE', 'COLIVING', 'Kids', 'INCOME', 'EDUC', 'NUMERACY', 'SAVINGS', 'SMOKER',
    'Whatisyourpoliticalaffiliation',
    'RaceBlackorAfricanAmerican', 'RaceAmericanIndianorAlaskanNative', 'RaceWhite',
    'RaceHispanic', 'RaceAsian', 'RaceHawaiianorPacificIslander', 'RaceOther',
    'ELIGIBLE', 'SUBJ_HEALTH', 'SUBJ_LONG_RISK',
]
data = data[keep_cols + aspects]

# lower-case every column name, then swap in more readable labels
data.columns = [col.lower() for col in data.columns]
new_names = {
    "s3_natural_order": "treatment",
    "pid": "ID",
    "text_integrated": "text_full",
    "educ": "education",
    "whatisyourpoliticalaffiliation": "politics",
    "raceblackorafricanamerican": "black",
    "raceamericanindianoralaskannative": "amerindian_alaskan",
    "racewhite": "white",
    "racehispanic": "hispanic",
    "raceasian": "asian",
    "racehawaiianorpacificislander": "hawaii_pacific",
    "raceother": "other",
    "eligible": "benefit_eligible",
    "subj_health": "perc_health",
    "subj_long_risk": "life_expectancy",
}
data = (
    data.rename(columns=new_names)
        # '#NULL!' is treated as the missing-value marker
        .replace('#NULL!', np.nan)
        # keep only rows that have integrated text
        .dropna(subset=['text_full'])
)

In [8]:
# recode treatment as |treatment - 1| so that 1 = unnatural order
# (re-running this cell on its own flips the coding back again)
data['treatment'] = (data['treatment'] - 1).abs()

In [9]:
# combine the "Yes"-flagged race indicator columns into one 'ethnicity' label
ethnic_list = ['black', 'amerindian_alaskan', 'white', 'hispanic', 'asian', 'hawaii_pacific', 'other']

# Start from an object-dtype column of real NaN. The previous np.where() call
# mixed a string with a float-NaN column, which numpy promotes to a string
# dtype, so rows with no "Yes" flag ended up as the literal text 'nan'.
ethnicity = pd.Series(np.nan, index=data.index, dtype=object)
for ethnic in ethnic_list:
    # later entries in ethnic_list overwrite earlier ones when several are "Yes"
    ethnicity = ethnicity.mask(data[ethnic] == "Yes", ethnic)
data['ethnicity'] = ethnicity

# the individual indicator columns are no longer needed
data = data.drop(columns=ethnic_list)

In [10]:
# every non-aspect column identifies the row and is repeated for each thought
id_cols = [col for col in data.columns if col not in aspects]
data_long = (
    # wide -> long: one row per (participant, aspect column)
    data.melt(id_vars=id_cols, value_vars=aspects)
        # remove aspect slots that were left empty
        .dropna(subset=['value'])
        # order by participant ID, then by aspect column name
        .sort_values(by=["ID", "variable"])
)

In [11]:
# number of distinct aspect columns remaining for each participant ID
n_thoughts = data_long.groupby("ID")['variable'].transform('nunique')
data_long = (
    data_long.assign(thought_count=n_thoughts)
             # the melted value column holds the thought text
             .rename(columns={"value": "thought"})
)

In [12]:
# the aspect column name is no longer needed once the count is stored
data_long = data_long.drop(columns='variable')
# move 'thought' to position 4 and 'thought_count' to position 5
for position, col in enumerate(['thought', 'thought_count'], start=4):
    data_long.insert(position, col, data_long.pop(col))

# Clean Text

In [16]:
# options handed to the clean_text helper (presumably from lan_text -- TODO confirm)
clean_options = dict(
    exclude_num=False,
    reduction="stem",
    keep_useful_stop=True,
    exclude_stop=True,
    filter_freq=True,
    low_filter=2,
    common_filter=2,
)
# clean the 'thought' column; the call is expected to add a 'thought_clean' column
data_long = clean_text(data_long, 'thought', **clean_options)
# place the cleaned text at position 6
data_long.insert(6, 'thought_clean', data_long.pop("thought_clean"))

Lower casing...
Decoding html...
Expanding contractions...
Replacing punctuation with spaces...
Removing stop words...
	['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', '

In [17]:
# write the long-format table without the index column; the relative path
# means the file lands in the kernel's current working directory
out_file = "claims_long.csv"
data_long.to_csv(out_file, index=False)