<a href="https://colab.research.google.com/github/david-j-cox/twitter-higher-ed/blob/main/basic_analyses.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Packages and modules

Coding is sped up by using code and algorithms that others have already written, so we don't have to reinvent the wheel. This next box imports some of the pre-existing packages we'll use. 

In [2]:
# Packages for path management
import os

# Packages for manipulating data
import numpy as np
from numpy import std, mean, sqrt
import pandas as pd

# Packages for visualizing data
import seaborn as sns 
import matplotlib.pyplot as plt
try:
  import sweetviz as sv
except:
    !pip install sweetviz
    import sweetviz as sv

# Packages for statistics and modeling
from scipy import stats
try:
  from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
except:
  !pip install vaderSentiment

# Read in raw data set

In [3]:
# Set the working directory to the corresponding repository.
# The default is the original author's machine-specific checkout; set the
# SURVEY_REPO_DIR environment variable to point at your own copy instead of
# editing this cell. All relative paths below are resolved against it.
directory = os.environ.get(
    'SURVEY_REPO_DIR',
    '/Users/davidjcox/Dropbox (Personal)/Projects/Manuscripts In Progress/Empirical/Endicott Data/diversity-survey/',
)
os.chdir(directory)

# Read in the data and work on a copy so the frame loaded from disk stays untouched
raw_data = pd.read_csv('./data/01_raw/raw_data.csv')
data = raw_data.copy()

In [9]:
# Sanity check: show one row out of every hundred to spot anything odd
data.iloc[::100]

Unnamed: 0,Time,Consent,Q1 of 20. What is your age?,Q2 of 20. With which gender do you identify?,Q3 of 20. What pronouns do you prefer?,Q4 of 20. With which ethnicity/race do you identify? Please list all that apply.,"Q5 of 20. With which religion do you identify? Please list all that apply, including if you do not identify with a religion.",Q6 of 20. With which political affiliation do you identify?,"Q7 of 20. What is the highest level of education you have completed? (If currently enrolled, please select highest degree received):",Please specify other terminal degree,...,Q18 of 20. Please rate your familiarity with the following terms: (Judaism),Q18 of 20. Please rate your familiarity with the following terms: (No religion (Atheist)),Q18 of 20. Please rate your familiarity with the following terms: (Paganism),Q18 of 20. Please rate your familiarity with the following terms: (Wiccan),Q19 of 20. What actionable items do you recommend we take as a field to increase diversity?,Q20 of 20. Please include any comments or feedback on how the demographic section of this survey can be improved to meet its goal of inclusivity and diversity.,Browser,IP Address,Unique ID,Location
0,10/21/20 12:30,"Yes, I am at least 18 years of age, am a stude...",46-60,Male,He/him/his,Jewish,,Libertarian,Doctoral degree,,...,Very familiar,Very familiar,Very familiar,Very familiar,"None, treat everyone as an individual and not ...",,Chrome 86.0.4240.75 / Windows,100.12.9.146,682348249,"40.620098114014, -73.751899719238"
100,10/28/20 12:27,"Yes, I am at least 18 years of age, am a stude...",31-45,male,He/him/his,White,,Democratic Socialist/Democratic Party,Master's degree,,...,Very familiar,Very familiar,Somewhat familiar,Somewhat familiar,Invest in recruiting marginalized population...,,Safari 13.1.2 / OS X,74.85.93.162,688774433,"47.614498138428, -122.34799957275"
200,3/9/21 17:30,"Yes, I am at least 18 years of age, am a stude...",31-45,Male,He/him/his,Caucasion,I do not identify with a religion or any spiri...,Democrat/moderate liberal,Bachelor's degree,,...,Somewhat familiar,Very familiar,Very familiar,Somewhat unfamiliar,I think the field of ABA is new enough that ma...,,Chrome 88.0.4324.182 / Windows,205.118.194.18,776160429,"40.498199462891, -111.84359741211"
300,3/9/21 23:45,"Yes, I am at least 18 years of age, am a stude...",18-30,Female,She/her/hers,Mixed White and Black Caribbean,,UK - left wing Green Party/labour,Master's degree,,...,Very familiar,Very familiar,Somewhat familiar,Somewhat familiar,Ensure that everyone gets 'a seat at the table...,It was great that fields were left open so I c...,Mobile Safari 12.1.2 / iOS,94.204.111.63,776275507,"24, 54"
400,3/20/21 19:01,"Yes, I am at least 18 years of age, am a stude...",31-45,Female,She/her/hers,White,,Independent,Master's degree,,...,Very familiar,Very familiar,Very familiar,Very familiar,Looking at socioeconomic background and making...,,Mobile Safari 14.0.3 / iOS,69.207.101.121,781294861,"43.224601745605, -77.592002868652"


## When programming, we often have to specify the column we're interested in looking at. The current column headers are very wordy. We'll rename the column headers to make them easier to work with. 

In [10]:
# Show the current (verbose) column headers as a plain list
data.columns.tolist()

['Time',
 'Consent',
 'Q1 of 20. What is your age?',
 'Q2 of 20. With which gender do you identify?',
 'Q3 of 20. What pronouns do you prefer?',
 'Q4 of 20. With which ethnicity/race do you identify? Please list all that apply.',
 'Q5 of 20. With which religion do you identify? Please list all that apply, including if you do not identify with a religion. ',
 'Q6 of 20. With which political affiliation do you identify?',
 'Q7 of 20. What is the highest level of education you have completed? (If currently enrolled, please select highest degree received):',
 'Please specify other terminal degree',
 'Q8 of 20. What is your annual household income?',
 'Q9 of 20. Do you identify as having a disability?',
 'If yes and you feel comfortable sharing, please specify:',
 'Q10 of 20. Please select applicable certification and/or licenses (select all that apply):',
 'Q11 of 20. How long have you identified as being in the field of behavior analysis?',
 'Q12 of 20. How long have you been certified an

In [11]:
# Use that text to create shorthand labels (one per column, in column order).
# NOTE(review): the recorded output of this cell shows the frame had 53 columns
# while this list has 50 labels, so pandas raises a length-mismatch ValueError.
# Reconcile the list with `list(data)` (or drop the extra columns first).
data.columns = [
    'time',
    'consent',
    'age',
    'gender',
    'pronouns',
    'ethnicity',
    'religion',
    'political_affil',
    'education',
    'degree_name',
    'income',
    'disability',
    'disability_name',
    'cert_license',
    'time_in_bx_anal',
    'time_cert_license',
    'country_live',
    'province',
    'state',
    'define_diversity',
    'familiar_cis_fem',
    'familiar_cis_male',
    'familiar_trans_fem',
    'familiar_trans_male',
    'familiar_non_binary',
    'familiar_gender_fluid',
    'familiar_gender_neutral',
    'familiar_asian',
    'familiar_black',
    'familiar_white',
    'familiar_latinx',
    'familiar_nat_amer',
    'familiar_pac_isl',
    'familiar_agnostic',
    'familiar_amish',
    'familiar_buddhism',
    'familiar_christian',
    'familiar_hinduism',
    'familiar_islam',
    'familiar_jehovah',
    'familiar_judaism',
    'familiar_atheist',
    'familiar_paganism',
    'familiar_wiccan',
    'actions_recommend',
    'survey_feedback',
    'browser',
    'ip_address',
    'unique_id',
    'location',
]

ValueError: Length mismatch: Expected axis has 53 elements, new values have 50 elements

In [None]:
# Lowercase the tweet text so the substring matching below is case-insensitive
lowered_content_pre = raw_data_pre['content'].str.lower()
raw_data_pre['content'] = lowered_content_pre

In [None]:
# Collect the row positions of tweets that mention any higher-education term.
# Positions (not index labels) are gathered because they are fed to `.iloc`
# below; a label lookup such as `raw_data_pre['content'][i]` only agrees with
# `.iloc` when the frame has a default 0..n-1 index.
keep_positions_pre = [
    pos
    for pos, text in enumerate(raw_data_pre['content'])
    # str() so that non-string cells (e.g. NaN) are searched as text
    if any(term in str(text) for term in hied_list)
]

# Build the frame with an explicit integer dtype so that an empty match list
# still yields a valid 'index' column instead of failing on the rename.
int_keep_pre = pd.Series(keep_positions_pre, dtype='int64', name='index').to_frame()
int_keep_pre.to_csv('./02_intermediate/ints_pre.csv')
print("raw total:", len(raw_data_pre))

# Use the saved positions to trim the raw df, then renumber the rows
df_pre = raw_data_pre.iloc[int_keep_pre['index']]
print("subset total: ", len(df_pre))
df_pre = df_pre.reset_index(drop=True)
df_pre.to_csv('./02_intermediate/clean_pre.csv')

In [None]:
# Load the post-COVID tweets, remove exact duplicate rows, and write the result
# back over the same file. The two prints report the row count before and
# after de-duplication.
post_covid_path = './01_raw/post_covid_learning.csv'
raw_data_post = pd.read_csv(post_covid_path)
print("Pre: ", len(raw_data_post))
raw_data_post = raw_data_post.drop_duplicates()
print("Post: ", len(raw_data_post))
raw_data_post.to_csv(post_covid_path)

In [None]:
# Lowercase the tweet text so the substring matching below is case-insensitive
lowered_content_post = raw_data_post['content'].str.lower()
raw_data_post['content'] = lowered_content_post

In [None]:
# Collect the row positions of tweets that mention a higher-education term OR
# a COVID term. The two term lists are checked independently, so a match on one
# list is found even if the other list is empty.
# Positions (not index labels) are gathered because they are fed to `.iloc`
# below; `raw_data_post` keeps gaps in its index after `drop_duplicates()`, so
# a label lookup such as `raw_data_post['content'][i]` would not line up.
keep_positions_post = [
    pos
    for pos, text in enumerate(raw_data_post['content'])
    # str() so that non-string cells (e.g. NaN) are searched as text
    if any(term in str(text) for term in hied_list)
    or any(term in str(text) for term in cov_list)
]

# Build the frame with an explicit integer dtype so that an empty match list
# still yields a valid 'index' column instead of failing on the rename.
int_keep_post = pd.Series(keep_positions_post, dtype='int64', name='index').to_frame()
int_keep_post.to_csv('./02_intermediate/ints_post.csv')
print("raw total:", len(raw_data_post))

# Use the saved positions to trim the raw df, then renumber the rows
df_post = raw_data_post.iloc[int_keep_post['index']]
print("subset total: ", len(df_post))
df_post = df_post.reset_index(drop=True)
df_post.to_csv('./02_intermediate/clean_post.csv')

In [None]:
# Combine the pre and post dfs into a single dataframe.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; the
# rows are stacked in the same order (pre first, then post).
all_data = pd.concat([df_pre, df_post])
# Drop the leftover index columns picked up from earlier CSV round-trips
all_data = all_data.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
all_data = all_data.reset_index(drop=True)
all_data.to_csv('./02_intermediate/all_data.csv')

# Sentiment Analysis using Vader model

In [None]:
# Score every tweet with the Vader analyzer and store the four components
# (positive, neutral, negative, compound) as new columns on all_data.
san = SentimentIntensityAnalyzer()
vader_results = [
    san.polarity_scores(all_data['content'][row]) for row in range(len(all_data))
]

pos = [result.get('pos') for result in vader_results]
neu = [result.get('neu') for result in vader_results]
neg = [result.get('neg') for result in vader_results]
compound = [result.get('compound') for result in vader_results]

all_data['vader_pos'] = pos
all_data['vader_neu'] = neu
all_data['vader_neg'] = neg
all_data['vader_com'] = compound

In [None]:
# One violin plot per Vader component, split by year, on a 2x2 grid
# (row-major order: pos, neu, neg, compound).
fig, axs = plt.subplots(2, 2, figsize=(10, 10))
vader_columns = ['vader_pos', 'vader_neu', 'vader_neg', 'vader_com']
for panel, column in zip(axs.flat, vader_columns):
    sns.violinplot(x=all_data['year'], y=all_data[column], ax=panel, palette="BrBG")
plt.savefig(fname='./figures/violin_vader.png', bbox_inches='tight')
plt.show()

In [None]:
# Summary statistics (`describe()`) of each Vader component for each year.
# Columns are named <last three letters of the component>_<year>, e.g. pos_2020.
vader_summaries = {}
for column in ['vader_pos', 'vader_neu', 'vader_neg', 'vader_com']:
    for year in [2020, 2021]:
        label = column[-3:] + "_" + str(year)
        vader_summaries[label] = all_data[column][all_data['year'] == year].describe()

vader_desc = pd.DataFrame(vader_summaries)
vader_desc.to_csv('./03_primary/vader_descriptions.csv')
vader_desc

In [None]:
# Compare 2020 vs 2021 for each Vader component: independent-samples t-test
# plus Cohen's d computed from the root-mean-square of the two groups' SDs.
# NOTE(review): numpy's `std` defaults to ddof=0 (population SD) — confirm
# that is the intended denominator for the effect size.
comparison, stat, p_val, coh_d = [], [], [], []
in_2020 = all_data['year'] == 2020
in_2021 = all_data['year'] == 2021

for measure in ['vader_pos', 'vader_neu', 'vader_neg', 'vader_com']:
    c0 = all_data[measure][in_2020]
    c1 = all_data[measure][in_2021]
    t_value, p_value = stats.ttest_ind(c0, c1)
    pooled_sd = sqrt((std(c0) ** 2 + std(c1) ** 2) / 2)

    comparison.append(measure)
    stat.append(t_value)
    p_val.append(p_value)
    coh_d.append((mean(c0) - mean(c1)) / pooled_sd)

# One row per component, one column per statistic
vader_stats = pd.DataFrame([comparison, stat, p_val, coh_d]).transpose()
vader_stats.columns = ['comparison', 't_stat', 'p_val', 'cohens_d']
vader_stats.to_csv('./03_primary/vader_stats.csv')
vader_stats

# Sentiment Analysis using TextBlob

In [None]:
# Score every tweet with TextBlob; the first element of `.sentiment` is stored
# as polarity and the second as subjectivity.
blob_sentiments = [
    TextBlob(all_data['content'][row]).sentiment for row in range(len(all_data))
]
polarity = [sentiment[0] for sentiment in blob_sentiments]
subjectivity = [sentiment[1] for sentiment in blob_sentiments]

all_data['text_blob_polarity'] = polarity
all_data['text_blob_subjectivity'] = subjectivity

In [None]:
# Side-by-side violin plots of the two TextBlob scores, split by year
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
textblob_columns = ['text_blob_polarity', 'text_blob_subjectivity']
for panel, column in zip(axs, textblob_columns):
    sns.violinplot(x=all_data['year'], y=all_data[column], ax=panel, palette="BrBG")
plt.savefig(fname='./figures/violin_textblob.png', bbox_inches='tight')
plt.show()

In [None]:
# Summary statistics (`describe()`) of each TextBlob score for each year.
# Columns are named txblb_<score>_<year>, e.g. txblb_polarity_2020.
textblob_summaries = {}
for score in ['polarity', 'subjectivity']:
    for year in [2020, 2021]:
        yearly_values = all_data['text_blob_' + score][all_data['year'] == year]
        textblob_summaries['txblb_' + score + '_' + str(year)] = yearly_values.describe()

txblb_desc = pd.DataFrame(textblob_summaries)
txblb_desc.to_csv('./03_primary/textblob_descriptions.csv')
txblb_desc

In [None]:
# Compare 2020 vs 2021 for each TextBlob score: independent-samples t-test
# plus Cohen's d computed from the root-mean-square of the two groups' SDs.
# NOTE(review): numpy's `std` defaults to ddof=0 (population SD) — confirm
# that is the intended denominator for the effect size.
comparison, stat, p_val, coh_d = [], [], [], []
in_2020 = all_data['year'] == 2020
in_2021 = all_data['year'] == 2021

for measure in ['text_blob_polarity', 'text_blob_subjectivity']:
    c0 = all_data[measure][in_2020]
    c1 = all_data[measure][in_2021]
    t_value, p_value = stats.ttest_ind(c0, c1)
    pooled_sd = sqrt((std(c0) ** 2 + std(c1) ** 2) / 2)

    comparison.append(measure)
    stat.append(t_value)
    p_val.append(p_value)
    coh_d.append((mean(c0) - mean(c1)) / pooled_sd)

# One row per score, one column per statistic
txblb_stats = pd.DataFrame([comparison, stat, p_val, coh_d]).transpose()
txblb_stats.columns = ['comparison', 't_stat', 'p_val', 'cohens_d']
txblb_stats.to_csv('./03_primary/txblb_stats.csv')
txblb_stats

In [None]:
# Correlation matrix
def corrdot(*args, **kwargs):
    """Draw the Pearson correlation of two series as a sized, coloured dot.

    The first two positional arguments are the series to correlate; keyword
    arguments are accepted but not used. The current axes are blanked, a dot
    whose area and colour track the coefficient is drawn at the centre, and
    the coefficient (two decimals, leading zero removed) is written over it.
    """
    first, second = args[0], args[1]
    corr_r = first.corr(second, 'pearson')
    strength = abs(corr_r)
    corr_text = format(corr_r, "2.2f").replace("0.", ".")

    ax = plt.gca()
    ax.set_axis_off()

    # Dot at the centre of the panel; colour is mapped on the fixed [-1, 1] scale
    ax.scatter([.5], [.5], s=strength * 10000, c=[corr_r], alpha=0.6,
               cmap="coolwarm", vmin=-1, vmax=1, transform=ax.transAxes)

    # Label grows with the strength of the correlation
    ax.annotate(corr_text, [.5, .5], xycoords="axes fraction", ha='center',
                va='center', fontsize=strength * 40 + 5)

def annotate_colname(x, **kws):
    """Write the name of series `x` in bold near the top-left of the current axes."""
    current_axes = plt.gca()
    current_axes.annotate(
        x.name,
        xy=(0.05, 0.9),
        xycoords=current_axes.transAxes,
        fontweight='bold',
    )

def plot_corr(df, title, save_name):
    """Draw, save and show a pair-grid correlation figure for `df`.

    Lower panels hold regression scatter plots, diagonal panels hold each
    column's distribution with its name, and upper panels hold the Pearson
    correlation drawn by `corrdot`.

    Parameters
    ----------
    df : DataFrame whose columns are plotted against each other.
    title : text used as the title of the whole figure.
    save_name : file stem; the figure is written to ./figures/<save_name>.png.
    """
    sns.set(style='white', font_scale=1.6)
    g = sns.PairGrid(df, aspect=1.4, diag_sharey=False)
    g.fig.set_size_inches(20, 20)
    g.map_lower(sns.regplot, scatter_kws={'s': 5, 'alpha': 0.1})
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # consider migrating to histplot/kdeplot when the seaborn version allows.
    g.map_diag(sns.distplot, kde_kws={'color': 'black'})
    g.map_diag(annotate_colname)
    g.map_upper(corrdot)
    # Title the figure as a whole: plt.title() would only label whichever
    # subplot happened to be drawn last. y > 1 keeps it clear of the top row.
    g.fig.suptitle(title, fontsize=26, y=1.02)
    plt.savefig(fname=f'./figures/{save_name}.png', bbox_inches='tight')
    plt.show()

In [None]:
# Correlation between the two overall sentiment scores: all tweets first,
# then each year on its own.
sentiment_pair = all_data[['vader_com', 'text_blob_polarity']]

plot_corr(df=sentiment_pair,
          title='All Data',
          save_name='sentiment_corr_matrix')

for year in (2020, 2021):
    plot_corr(df=sentiment_pair[all_data['year'] == year],
              title=str(year),
              save_name=f'sent_corr_matrix_{year}')

In [None]:
# Persist the combined frame, now carrying the sentiment columns
all_data.to_csv(path_or_buf='./03_primary/all_data.csv')

# EDA and comparing dataframes

In [None]:
# If picking up fresh: reload the combined frame that already carries the
# sentiment columns. The copy in ./02_intermediate/ was written before the
# Vader/TextBlob scores were added, so the cells below (which use 'vader_com'
# and the other score columns) need the ./03_primary/ version.
all_data = pd.read_csv('./03_primary/all_data.csv').drop(['Unnamed: 0'], axis=1)
# Peek at every 15,000th row
all_data[::15000]

In [None]:
# Pass the combined frame through dabl.clean; the result (dabl_df) is what the
# DABL plots and Sweetviz reports below consume.
# NOTE(review): `dabl` is not imported in the visible import cell — confirm it
# is imported before this cell runs.
dabl_df = dabl.clean(all_data, verbose=1)

## DABL Plots

In [None]:
# All data
# Let dabl plot the cleaned frame with the Vader compound score ('vader_com')
# as the target column.
dabl.plot(dabl_df, target_col='vader_com')

## Sweetviz for comparisons

In [None]:
# Build a Sweetviz report over the whole cleaned frame and write it to an HTML
# file, opening it in the browser.
html_options = {
    'filepath': 'SWEETVIZ_REPORT.html',
    'open_browser': True,
    'layout': 'widescreen',
    'scale': None,
}
all_report = sv.analyze(dabl_df)
all_report.show_html(**html_options)

In [None]:
# 2020 vs 2021
# Earlier cells compare `year` against the integers 2020/2021; comparing it
# against the strings '2020'/'2021' selects nothing when the column is still
# numeric. Casting to text first matches both representations.
year_as_text = dabl_df['year'].astype(str)
df_20 = dabl_df[year_as_text == '2020'].reset_index(drop=True)
df_21 = dabl_df[year_as_text == '2021'].reset_index(drop=True)
compare_report = sv.compare([df_20, "Pre-COVID"], [df_21, "Post-COVID"])
compare_report.show_html()

# Topic Model

In [None]:
# Keep only the tweet metadata and sentiment columns used for topic modelling
topic_model_columns = [
    'date', 'content', 'id',
    'replyCount', 'retweetCount', 'likeCount', 'quoteCount',
    'conversationId', 'lang', 'sourceLabel', 'hashtags',
    'time', 'year',
    'vader_pos', 'vader_neu', 'vader_neg', 'vader_com',
    'text_blob_polarity', 'text_blob_subjectivity',
]
trim_data = all_data[topic_model_columns]
trim_data.head()