# Import, Download, & Variable Statements

In [6]:
# Import & download statements
!git clone https://github.com/d-atallah/implicit_gender_bias.git
import pandas as pd
import string
import re
import joblib
from implicit_gender_bias import config as cf
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,precision_score, recall_score, f1_score, accuracy_score,roc_curve, roc_auc_score,log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources used by the stopword list and the WordNet lemmatizer created below
nltk.download('stopwords')
nltk.download('wordnet')
# English stopwords held in a set, and a single shared lemmatizer instance
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

Cloning into 'implicit_gender_bias'...
remote: Enumerating objects: 70, done.[K
remote: Counting objects: 100% (70/70), done.[K
remote: Compressing objects: 100% (61/61), done.[K
remote: Total 70 (delta 24), reused 33 (delta 6), pack-reused 0[K
Receiving objects: 100% (70/70), 43.90 KiB | 6.27 MiB/s, done.
Resolving deltas: 100% (24/24), done.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [7]:
# Variables
# Inputs: base data folder and the base names (without .csv) of the source files
folder_path = cf.filepath()
csv_files = [
    'facebook_wiki_posts',
    'facebook_wiki_responses',
    'fitocracy_posts',
    'fitocracy_responses',
    'reddit_posts',
    'reddit_responses',
    'ted_responses',
    'facebook_congress_posts',
    'annotations',
    'facebook_congress_responses',
]

# Outputs: the file name is appended directly to folder_path (no separator is inserted),
# so folder_path must already end with a path separator
annotations_output = f'{folder_path}annotations_combined.csv'
responses_combined_output = f'{folder_path}responses_combined.csv'
posts_combined_output = f'{folder_path}posts_combined.csv'
sources_combined_output = f'{folder_path}sources_combined_output.csv'
log_ngram_pred_output = f'{folder_path}log_ngram_preds.csv'

Mounted at /content/drive


# Load Source Data

In [8]:
# Raw Pandas Dataframes
# Each source CSV is read once into a *_raw frame; reset_dfs() later derives the working frames from these
facebook_wiki_posts_raw = pd.read_csv(f'{folder_path}facebook_wiki_posts.csv')
facebook_wiki_responses_raw = pd.read_csv(f'{folder_path}facebook_wiki_responses.csv')
fitocracy_posts_raw = pd.read_csv(f'{folder_path}fitocracy_posts.csv')
fitocracy_responses_raw = pd.read_csv(f'{folder_path}fitocracy_responses.csv')
reddit_posts_raw = pd.read_csv(f'{folder_path}reddit_posts.csv')
reddit_responses_raw = pd.read_csv(f'{folder_path}reddit_responses.csv')
ted_responses_raw = pd.read_csv(f'{folder_path}ted_responses.csv')
annotations_raw = pd.read_csv(f'{folder_path}annotations.csv')
facebook_congress_posts_raw = pd.read_csv(f'{folder_path}facebook_congress_posts.csv')
# The congress responses file is intentionally left unloaded here
#facebook_congress_responses_raw = pd.read_csv(folder_path+'facebook_congress_responses.csv')

In [8]:
#load_dict = cf.load_df(filepath, annotations_raw, 'annotations')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# If the combined data and vectorizer files have already been created, uncomment the lines below to load them instead of rebuilding them

# Responses combined dataframe
#responses_combined = pd.read_csv(responses_combined_output)

# Count vectorizer and training/test data
#vectorizer_count = joblib.load(folder_path+'count_vectorizer.pkl')
#X_train_vcount = joblib.load(X_train_vectorized_filepath)
#X_test_vcount = joblib.load(X_test_vectorized_filepath)

  exec(code_obj, self.user_global_ns, self.user_ns)


# Source Data Cleaning

## Define Functions

In [9]:
def reset_dfs():
    """
    Rebuilds every working DataFrame from its ``*_raw`` counterpart.

    Each working frame is assigned the result of ``reset_index()`` on the matching raw frame,
    so the raw frames themselves are never modified. The working frames are module-level
    globals; nothing is returned.
    """
    global facebook_wiki_posts, facebook_wiki_responses
    global fitocracy_posts, fitocracy_responses
    global reddit_posts, reddit_responses
    global ted_responses, annotations
    global facebook_congress_posts, facebook_congress_responses

    # Post tables
    facebook_wiki_posts = facebook_wiki_posts_raw.reset_index()
    fitocracy_posts = fitocracy_posts_raw.reset_index()
    reddit_posts = reddit_posts_raw.reset_index()
    facebook_congress_posts = facebook_congress_posts_raw.reset_index()

    # Response tables (the congress responses assignment is disabled)
    facebook_wiki_responses = facebook_wiki_responses_raw.reset_index()
    fitocracy_responses = fitocracy_responses_raw.reset_index()
    reddit_responses = reddit_responses_raw.reset_index()
    ted_responses = ted_responses_raw.reset_index()
    #facebook_congress_responses = facebook_congress_responses_raw.reset_index()

    # Annotation table
    annotations = annotations_raw.reset_index()

In [10]:
# Function to create UID
def create_id(df):
    """
    Adds a 'sourceID' column made of each row's 'source' value followed by its 'index' value as text.

    The DataFrame is modified in place and nothing is returned.

    Parameters:
    - df (pd.DataFrame): The input DataFrame; must contain 'source' and 'index' columns.
    """
    index_as_text = df['index'].astype(str)
    df['sourceID'] = df['source'] + index_as_text

In [11]:
def null_analysis(df, columns):
    """
    Summarizes how many values are missing in selected columns of a DataFrame.

    Parameters:
    - df (pd.DataFrame): The DataFrame to inspect.
    - columns (list): Names of the columns to check for missing values.

    Returns:
    - pd.DataFrame: One row per requested column holding the column name and, as formatted
      strings, its null count, the total row count and the percentage of null rows.
    """
    # The row count is the same for every column, so read it once
    total_rows = df.shape[0]

    def summarize(column):
        # Build the formatted summary record for a single column
        null_rows = df[column].isnull().sum()
        percent_null = (null_rows / total_rows) * 100
        return {
            'column': column,
            'null rows': f'{null_rows:,.0f}',
            'total rows': f'{total_rows:,.0f}',
            'percent null': f'{percent_null:.5f}%'
        }

    return pd.DataFrame([summarize(column) for column in columns])

In [12]:
def post_merge(post_df, response_df):
    """
    Joins posts to their responses after removing the responses' 'op_gender' column.

    Parameters:
    - post_df (pd.DataFrame): The posts DataFrame (left side of the join).
    - response_df (pd.DataFrame): The responses DataFrame; must contain 'op_gender'.

    Returns:
    - pd.DataFrame: Inner join of the two frames on 'op_id' and 'post_id', with the
      result's index added as a leading 'index' column.
    """
    # Only the copy of op_gender coming from post_df (if any) survives the merge
    responses_without_gender = response_df.drop(columns='op_gender')
    return (
        post_df
        .merge(responses_without_gender, on=['op_id', 'post_id'], how='inner')
        .reset_index()
    )

## Annotations

In [13]:
# Build the working DataFrames from the *_raw frames (each gets its index exposed via reset_index)
reset_dfs()

In [14]:
# Analyze null values in the annotation columns cleaned below (response text and gender label)
null_analysis(annotations, ['response_text','op_gender'])

Unnamed: 0,column,null rows,total rows,percent null
0,response_text,2,15352,0.01303%
1,op_gender,0,15352,0.00000%


In [15]:
# Map gender to binary indicator: 'W' -> 0, 'M' -> 1.
# Series.map leaves any value that is not a key of the dict as NaN.
annotations['op_gender_binary'] = annotations['op_gender'].map({'W': 0, 'M': 1})

In [16]:
# Drop rows with missing response text or a missing binary gender label
annotations = annotations.dropna(subset=['response_text','op_gender_binary'])

In [17]:
# Write the cleaned annotations to annotations_output (index=False keeps the DataFrame index out of the CSV),
# then preview the first rows
annotations.to_csv(annotations_output, index=False)
annotations.head()

Unnamed: 0,index,source,op_gender,post_text,response_text,sentiment,relevance,op_gender_binary
0,0,facebook_wiki,W,Stopped by Fashion Week and got to hang with A...,You are Both Sweet Ashley Tisdale and Lauren C...,Positive,Poster,0
1,1,facebook_wiki,M,"Well guys, real progress is happening. I'm 50 ...",Give us the first page to read. ONE PAGE.,Mixed,Content,1
2,2,facebook_wiki,W,Tonight is going to be a good night #PerfectMo...,this is my city was there 2 weeks a go,Neutral,Content,0
3,3,facebook_wiki,M,I know grandma Gilmore is real proud of you ht...,if grizzly Adams had a beard.,Neutral,Content,1
4,4,facebook_wiki,W,#NEWS to KNOW this AM - Mayor Emanuel will mak...,"Good morning Lourdes, have a great day! Great ...",Positive,Irrelevant,0


## Responses Only

In [18]:
# Reset dataframes to raw data so this section starts from fresh copies of the *_raw frames
reset_dfs()

In [19]:
# Create source field: a short platform code per response table
# (FW = facebook_wiki, F = fitocracy, R = reddit, T = ted); create_id uses it as the sourceID prefix
facebook_wiki_responses['source'] = 'FW'
fitocracy_responses['source'] = 'F'
reddit_responses['source'] = 'R'
#facebook_congress_responses['source'] = 'FC'
ted_responses['source'] = 'T'

In [20]:
# Loop through sources and create UID (create_id adds a 'sourceID' column to each frame in place)
sources = [facebook_wiki_responses,fitocracy_responses,reddit_responses,ted_responses]#facebook_congress_responses
for source in sources:
    create_id(source)

In [21]:
# Union tables: stack the per-platform response frames row-wise;
# ignore_index=True renumbers the combined rows from 0
responses_combined = pd.concat(sources, ignore_index=True)

In [22]:
# Analyze null values in the combined response columns cleaned below (response text and gender label)
null_analysis(responses_combined, ['response_text','op_gender'])

Unnamed: 0,column,null rows,total rows,percent null
0,response_text,216,12629961,0.00171%
1,op_gender,67,12629961,0.00053%


In [23]:
# Map gender to binary indicator: 'W' -> 0, 'M' -> 1 (any other value, including missing, becomes NaN)
responses_combined['op_gender_binary'] = responses_combined['op_gender'].map({'W': 0, 'M': 1})

# Drop rows with missing response text or a missing binary gender label
responses_combined = responses_combined.dropna(subset=['response_text','op_gender_binary'])

In [24]:
# Write the cleaned responses to responses_combined_output (index=False keeps the DataFrame index out of the CSV),
# then preview the first rows
responses_combined.to_csv(responses_combined_output, index=False)
responses_combined.head()

Unnamed: 0,index,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,source,sourceID,responder_gender,subreddit,responder_gender_visible,op_gender_binary
0,0,11679984,M,0,Michelle,Is this watch going to make it to LaPorte county?,Byron Miranda,Wikipedia_American_television_news_anchors,FW,FW0,,,,1.0
1,1,11679984,M,0,Melissa,Anything for Wilmington area,Byron Miranda,Wikipedia_American_television_news_anchors,FW,FW1,,,,1.0
2,2,11679984,M,0,Darlene,"Thanks, please keep us posted.",Byron Miranda,Wikipedia_American_television_news_anchors,FW,FW2,,,,1.0
3,3,11679984,M,0,Cheryl,Thanks Byron,Byron Miranda,Wikipedia_American_television_news_anchors,FW,FW3,,,,1.0
4,4,11679984,M,0,Melissa,[[STICKER]],Byron Miranda,Wikipedia_American_television_news_anchors,FW,FW4,,,,1.0


## Posts Only

In [25]:
# Reset dataframes to raw data; this also discards the 'source' and 'sourceID' columns
# that the previous section added to the response frames in place
reset_dfs()

In [26]:
# Create source field: a short platform code per post table
# (FW = facebook_wiki, F = fitocracy, R = reddit, FC = facebook_congress); create_id uses it as the sourceID prefix
facebook_wiki_posts['source'] = 'FW'
fitocracy_posts['source'] = 'F'
reddit_posts['source'] = 'R'
facebook_congress_posts['source'] = 'FC'

In [27]:
# Loop through sources and create UID (create_id adds a 'sourceID' column to each frame in place)
sources = [facebook_wiki_posts,fitocracy_posts,reddit_posts,facebook_congress_posts]
for source in sources:
    create_id(source)

In [28]:
# Union tables: stack the per-platform post frames row-wise;
# ignore_index=True renumbers the combined rows from 0
posts_combined = pd.concat(sources, ignore_index=True)

In [29]:
# Map gender to binary indicator: 'W' -> 0, 'M' -> 1 (any other value, including missing, becomes NaN)
posts_combined['op_gender_binary'] = posts_combined['op_gender'].map({'W': 0, 'M': 1})

# Drop rows with missing post text.
# NOTE(review): unlike the responses cleaning, rows with a missing op_gender_binary are kept here — confirm this is intended
posts_combined = posts_combined.dropna(subset=['post_text'])

In [30]:
# Write the cleaned posts to posts_combined_output (index=False keeps the DataFrame index out of the CSV),
# then preview the first rows
posts_combined.to_csv(posts_combined_output, index=False)
posts_combined.head()

Unnamed: 0,index,op_id,op_gender,post_id,post_text,post_type,source,sourceID,subreddit,op_gender_visible,op_gender_binary
0,0,11679984,M,0,Tornado watch in effect tonight. Be safe. Plea...,photo,FW,FW0,,,1
2,2,11679984,M,2,Temps warming up for the holiday weekend! I wi...,photo,FW,FW2,,,1
3,3,11679984,M,3,Showers heading this way ..,photo,FW,FW3,,,1
4,4,11679984,M,4,Storm potential update...feel free to share th...,photo,FW,FW4,,,1
6,6,11679984,M,6,"Hello October! Temps will be chilly tomorrow, ...",photo,FW,FW6,,,1


## Merging Posts and Responses

In [31]:
# Reset dataframes to raw data; this also discards the 'source' and 'sourceID' columns
# that the previous section added to the post frames in place
reset_dfs()

In [32]:
# Remove 'subreddit' from the reddit responses before they are merged with the reddit posts
# (presumably the posts carry the same column — verify against the raw files)
reddit_responses = reddit_responses.drop(columns='subreddit')

In [33]:
# Merge applicable dataframes: inner-join each platform's posts with its responses on op_id and post_id
facebook_wiki_merged = post_merge(facebook_wiki_posts, facebook_wiki_responses)
fitocracy_merged = post_merge(fitocracy_posts, fitocracy_responses)
reddit_merged = post_merge(reddit_posts, reddit_responses)
# Disabled: the facebook_congress_responses CSV load is commented out above, so there is no frame to merge
#facebook_congress_merged = post_merge(facebook_congress_posts, facebook_congress_responses)

NameError: name 'facebook_congress_responses' is not defined

In [None]:
# Create source field: a short platform code per merged table
# (FW = facebook_wiki, F = fitocracy, R = reddit); create_id uses it as the sourceID prefix
facebook_wiki_merged['source'] = 'FW'
fitocracy_merged['source'] = 'F'
reddit_merged['source'] = 'R'
# facebook_congress_merged is never created (its post_merge call is commented out because the
# congress responses are not loaded), so assigning to it here raised a NameError.
#facebook_congress_merged['source'] = 'FC'

In [None]:
# Loop through sources and create UID (create_id adds a 'sourceID' column to each frame in place).
# facebook_congress_merged is left out of the list: it is never created because its post_merge
# call is commented out, so referencing it here raised a NameError.
sources = [facebook_wiki_merged,fitocracy_merged,reddit_merged]#facebook_congress_merged
for source in sources:
    create_id(source)

In [None]:
# Union tables: stack the merged per-platform frames row-wise;
# ignore_index=True renumbers the combined rows from 0
sources_combined = pd.concat(sources, ignore_index=True)

Unnamed: 0,index,index_x,op_id,op_gender,post_id,post_text,post_type,index_y,responder_id,response_text,op_name,op_category,source,sourceID,responder_gender,subreddit,op_gender_visible,responder_gender_visible
0,0,0,11679984,M,0,Tornado watch in effect tonight. Be safe. Plea...,photo,0,Michelle,Is this watch going to make it to LaPorte county?,Byron Miranda,Wikipedia_American_television_news_anchors,FW,FW0,,,,
1,1,0,11679984,M,0,Tornado watch in effect tonight. Be safe. Plea...,photo,1,Melissa,Anything for Wilmington area,Byron Miranda,Wikipedia_American_television_news_anchors,FW,FW1,,,,
2,2,0,11679984,M,0,Tornado watch in effect tonight. Be safe. Plea...,photo,2,Darlene,"Thanks, please keep us posted.",Byron Miranda,Wikipedia_American_television_news_anchors,FW,FW2,,,,
3,3,0,11679984,M,0,Tornado watch in effect tonight. Be safe. Plea...,photo,3,Cheryl,Thanks Byron,Byron Miranda,Wikipedia_American_television_news_anchors,FW,FW3,,,,
4,4,0,11679984,M,0,Tornado watch in effect tonight. Be safe. Plea...,photo,4,Melissa,[[STICKER]],Byron Miranda,Wikipedia_American_television_news_anchors,FW,FW4,,,,


In [None]:
# Map gender to binary indicator: 'W' -> 0, 'M' -> 1.
# Series.map leaves any value that is not a key of the dict as NaN; no rows are dropped here.
sources_combined['op_gender_binary'] = sources_combined['op_gender'].map({'W': 0, 'M': 1})

In [None]:
# Write the merged posts/responses to sources_combined_output (index=False keeps the DataFrame index out of the CSV),
# then preview the first rows
sources_combined.to_csv(sources_combined_output, index=False)
sources_combined.head()