# Import, Download, & Variable Statements

In [1]:
# Import & download statements
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fetch the NLTK resources used for stop-word removal and lemmatization
for corpus in ('stopwords', 'wordnet'):
    nltk.download(corpus)

# Shared text-processing helpers: a lemmatizer and the English stop-word set
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gibsonce/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gibsonce/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [21]:
# Variables
# Directory that holds every source CSV (trailing slash is required: paths are built by concatenation)
extract_path = '/home/gibsonce/datallah-jaymefis-gibsonce/'

# Base names (without the .csv extension) of the source files
csv_files = [
    'facebook_wiki_posts',
    'facebook_wiki_responses',
    'fitocracy_posts',
    'fitocracy_responses',
    'reddit_posts',
    'reddit_responses',
    'ted_responses',
    'facebook_congress_posts',
    'facebook_congress_responses',
    'annotations',
]

# Destination for the logistic-regression n-gram predictions, stored next to the source data
log_ngram_pred_path = extract_path + 'log_ngram_preds.csv'

# Load Source Data

In [3]:
# Raw Pandas Dataframes
# Each CSV under extract_path is read once into a *_raw frame; later cells
# rebuild their working copies from these through reset_dfs().

# Facebook (Wikipedia) posts and responses
facebook_wiki_posts_raw = pd.read_csv(f'{extract_path}facebook_wiki_posts.csv')
facebook_wiki_responses_raw = pd.read_csv(f'{extract_path}facebook_wiki_responses.csv')

# Fitocracy posts and responses
fitocracy_posts_raw = pd.read_csv(f'{extract_path}fitocracy_posts.csv')
fitocracy_responses_raw = pd.read_csv(f'{extract_path}fitocracy_responses.csv')

# Reddit posts and responses
reddit_posts_raw = pd.read_csv(f'{extract_path}reddit_posts.csv')
reddit_responses_raw = pd.read_csv(f'{extract_path}reddit_responses.csv')

# TED responses and the annotations file
ted_responses_raw = pd.read_csv(f'{extract_path}ted_responses.csv')
annotations_raw = pd.read_csv(f'{extract_path}annotations.csv')

# Facebook (Congress) posts and responses
facebook_congress_posts_raw = pd.read_csv(f'{extract_path}facebook_congress_posts.csv')
facebook_congress_responses_raw = pd.read_csv(f'{extract_path}facebook_congress_responses.csv')

# Source Data Cleaning

## Define Functions

In [4]:
# Copy Dataframes to not overwrite original
def reset_dfs():
    """Rebuild every working DataFrame from its ``*_raw`` counterpart.

    Each module-level working name (``facebook_wiki_posts``, ``reddit_responses``,
    ``annotations``, ...) is rebound to ``<name>_raw.reset_index()``. The result
    is a new frame that carries the previous index as an ``index`` column, which
    ``create_id`` later reads. The ``*_raw`` frames are not modified.
    """
    global facebook_wiki_posts, facebook_wiki_responses, fitocracy_posts, fitocracy_responses,reddit_posts, reddit_responses,ted_responses, annotations,facebook_congress_posts, facebook_congress_responses

    # Raw frames listed in the same order as the assignment targets below
    raw_frames = (
        facebook_wiki_posts_raw,
        facebook_wiki_responses_raw,
        fitocracy_posts_raw,
        fitocracy_responses_raw,
        reddit_posts_raw,
        reddit_responses_raw,
        ted_responses_raw,
        annotations_raw,
        facebook_congress_posts_raw,
        facebook_congress_responses_raw,
    )

    (
        facebook_wiki_posts,
        facebook_wiki_responses,
        fitocracy_posts,
        fitocracy_responses,
        reddit_posts,
        reddit_responses,
        ted_responses,
        annotations,
        facebook_congress_posts,
        facebook_congress_responses,
    ) = [raw.reset_index() for raw in raw_frames]

In [5]:
# Function to create UID
def create_id(df):
    """Add a ``sourceID`` column to ``df`` in place.

    The identifier is the row's ``source`` value followed by its ``index``
    value rendered as text (for example ``'FW'`` and ``0`` give ``'FW0'``).

    Args:
        df: DataFrame that already has ``source`` and ``index`` columns.

    Returns:
        None. ``df`` itself is modified.
    """
    prefix = df['source']
    row_number = df['index'].astype(str)
    df['sourceID'] = prefix + row_number

In [6]:
# Drop common columns then merge post and response
def post_merge(post_df, response_df):
    """Pair every response with the post it belongs to.

    ``op_gender`` is removed from the responses before joining, so the result
    carries only the copy that comes from ``post_df``. Rows are matched with an
    inner join on ``op_id`` and ``post_id``; posts without responses and
    responses without a matching post are left out.

    Args:
        post_df: DataFrame of posts with ``op_id`` and ``post_id`` columns.
        response_df: DataFrame of responses with ``op_id``, ``post_id`` and
            ``op_gender`` columns. It is not modified.

    Returns:
        The joined DataFrame after ``reset_index()``, i.e. with its previous
        index stored in a new leading ``index`` column.
    """
    responses_without_gender = response_df.drop(columns='op_gender')
    joined = post_df.merge(
        responses_without_gender,
        how='inner',
        on=['op_id', 'post_id'],
    )
    return joined.reset_index()

## Responses Only

In [7]:
# Reset dataframes to raw data
# reset_dfs() rebinds every working frame to a fresh reset_index() copy of its
# *_raw frame, so this section starts without columns added by earlier cells.
reset_dfs()

In [8]:
# Tag each response frame with a short code naming the dataset it came from
for frame, tag in (
    (facebook_wiki_responses, 'FW'),
    (fitocracy_responses, 'F'),
    (reddit_responses, 'R'),
    (facebook_congress_responses, 'FC'),
    (ted_responses, 'T'),
):
    frame['source'] = tag

In [9]:
# Gather the tagged response frames, then add a sourceID column to each one
sources = [
    facebook_wiki_responses,
    fitocracy_responses,
    reddit_responses,
    facebook_congress_responses,
    ted_responses,
]
for responses_df in sources:
    create_id(responses_df)

In [10]:
# Stack all response frames into one table with a fresh 0..n-1 index
responses_combined = pd.concat(objs=sources, ignore_index=True)

In [11]:
# Encode the original poster's gender numerically: 'W' -> 0, 'M' -> 1.
# Values outside the mapping become NaN.
responses_combined['op_gender_binary'] = responses_combined['op_gender'].map(dict(W=0, M=1))

# Keep only rows that have both response text and a mapped gender
responses_combined = responses_combined.dropna(subset=['response_text', 'op_gender_binary'])

In [12]:
# Preview the first rows of the combined responses after the gender mapping and NA drops
responses_combined.head()

Unnamed: 0,index,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,source,sourceID,responder_gender,subreddit,responder_gender_visible,op_gender_binary
0,0,11679984,M,0,Michelle,Is this watch going to make it to LaPorte county?,Byron Miranda,Wikipedia_American_television_news_anchors,FW,FW0,,,,1.0
1,1,11679984,M,0,Melissa,Anything for Wilmington area,Byron Miranda,Wikipedia_American_television_news_anchors,FW,FW1,,,,1.0
2,2,11679984,M,0,Darlene,"Thanks, please keep us posted.",Byron Miranda,Wikipedia_American_television_news_anchors,FW,FW2,,,,1.0
3,3,11679984,M,0,Cheryl,Thanks Byron,Byron Miranda,Wikipedia_American_television_news_anchors,FW,FW3,,,,1.0
4,4,11679984,M,0,Melissa,[[STICKER]],Byron Miranda,Wikipedia_American_television_news_anchors,FW,FW4,,,,1.0


## Posts Only

In [13]:
# Reset dataframes to raw data
# Rebinds the working frames from the *_raw frames again, discarding the
# `source` / `sourceID` columns the previous section added to the response frames.
reset_dfs()

In [14]:
# Tag each post frame with a short code naming the dataset it came from
for frame, tag in (
    (facebook_wiki_posts, 'FW'),
    (fitocracy_posts, 'F'),
    (reddit_posts, 'R'),
    (facebook_congress_posts, 'FC'),
):
    frame['source'] = tag

In [15]:
# Gather the tagged post frames, then add a sourceID column to each one
sources = [
    facebook_wiki_posts,
    fitocracy_posts,
    reddit_posts,
    facebook_congress_posts,
]
for posts_df in sources:
    create_id(posts_df)

In [16]:
# Stack all post frames into one table with a fresh 0..n-1 index, then preview it
posts_combined = pd.concat(objs=sources, ignore_index=True)
posts_combined.head()

Unnamed: 0,index,op_id,op_gender,post_id,post_text,post_type,source,sourceID,subreddit,op_gender_visible
0,0,11679984,M,0,Tornado watch in effect tonight. Be safe. Plea...,photo,FW,FW0,,
1,1,11679984,M,1,,photo,FW,FW1,,
2,2,11679984,M,2,Temps warming up for the holiday weekend! I wi...,photo,FW,FW2,,
3,3,11679984,M,3,Showers heading this way ..,photo,FW,FW3,,
4,4,11679984,M,4,Storm potential update...feel free to share th...,photo,FW,FW4,,


In [17]:
# Encode the original poster's gender numerically: 'W' -> 0, 'M' -> 1.
# Values outside the mapping become NaN.
posts_combined['op_gender_binary'] = posts_combined['op_gender'].map(dict(W=0, M=1))

# Keep only the posts that actually have text
posts_combined = posts_combined[posts_combined['post_text'].notna()]

## Merging Posts and Responses

In [18]:
# Reset dataframes to raw data
# Rebinds the working frames from the *_raw frames so the merges below start
# from frames without the `source` / `sourceID` columns added in earlier sections.
reset_dfs()

In [19]:
# Drop common columns
# `subreddit` is removed from the Reddit responses before merging
# (presumably because reddit_posts carries it too - TODO confirm).
# errors='ignore' keeps this cell re-runnable: it overwrites its own input, so
# without it a second run would raise KeyError on the already-dropped column.
reddit_responses = reddit_responses.drop(columns='subreddit', errors='ignore')

In [22]:
# Join each dataset's responses onto its posts (post_merge inner-joins on op_id and post_id)
facebook_wiki_merged = post_merge(
    post_df=facebook_wiki_posts,
    response_df=facebook_wiki_responses,
)
fitocracy_merged = post_merge(
    post_df=fitocracy_posts,
    response_df=fitocracy_responses,
)
reddit_merged = post_merge(
    post_df=reddit_posts,
    response_df=reddit_responses,
)
facebook_congress_merged = post_merge(
    post_df=facebook_congress_posts,
    response_df=facebook_congress_responses,
)

In [23]:
# Tag each merged frame with a short code naming the dataset it came from
for frame, tag in (
    (facebook_wiki_merged, 'FW'),
    (fitocracy_merged, 'F'),
    (reddit_merged, 'R'),
    (facebook_congress_merged, 'FC'),
):
    frame['source'] = tag

In [24]:
# Gather the tagged merged frames, then add a sourceID column to each one
sources = [
    facebook_wiki_merged,
    fitocracy_merged,
    reddit_merged,
    facebook_congress_merged,
]
for merged_df in sources:
    create_id(merged_df)

In [25]:
# Stack all merged post/response frames into one table with a fresh 0..n-1 index, then preview it
sources_combined = pd.concat(objs=sources, ignore_index=True)
sources_combined.head()

Unnamed: 0,index,index_x,op_id,op_gender,post_id,post_text,post_type,index_y,responder_id,response_text,op_name,op_category,source,sourceID,responder_gender,subreddit,op_gender_visible,responder_gender_visible
0,0,0,11679984,M,0,Tornado watch in effect tonight. Be safe. Plea...,photo,0,Michelle,Is this watch going to make it to LaPorte county?,Byron Miranda,Wikipedia_American_television_news_anchors,FW,FW0,,,,
1,1,0,11679984,M,0,Tornado watch in effect tonight. Be safe. Plea...,photo,1,Melissa,Anything for Wilmington area,Byron Miranda,Wikipedia_American_television_news_anchors,FW,FW1,,,,
2,2,0,11679984,M,0,Tornado watch in effect tonight. Be safe. Plea...,photo,2,Darlene,"Thanks, please keep us posted.",Byron Miranda,Wikipedia_American_television_news_anchors,FW,FW2,,,,
3,3,0,11679984,M,0,Tornado watch in effect tonight. Be safe. Plea...,photo,3,Cheryl,Thanks Byron,Byron Miranda,Wikipedia_American_television_news_anchors,FW,FW3,,,,
4,4,0,11679984,M,0,Tornado watch in effect tonight. Be safe. Plea...,photo,4,Melissa,[[STICKER]],Byron Miranda,Wikipedia_American_television_news_anchors,FW,FW4,,,,


In [26]:
# Encode the original poster's gender numerically: 'W' -> 0, 'M' -> 1.
# Values outside the mapping become NaN.
sources_combined['op_gender_binary'] = sources_combined['op_gender'].map(dict(W=0, M=1))

# Supervised Learning

In [27]:
# Features are the response texts; the label is the poster's binary gender
X, y = responses_combined['response_text'], responses_combined['op_gender_binary']

# Hold out 20% of the rows for testing with a fixed seed.
# Stratification is on `source` (not on the label), so the split follows the mix of datasets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=responses_combined['source'],
    test_size=0.2,
    random_state=42,
)

In [28]:
# Bag-of-words features over unigrams and bigrams, with English stop words removed
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
)

# The vocabulary is learned from the training texts only, then reused for the test texts
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [None]:
# Fit a logistic regression on the n-gram counts (up to 1000 solver iterations)
model = LogisticRegression(max_iter=1000).fit(X_train_vectorized, y_train)

# Predict labels for the held-out texts
y_pred = model.predict(X_test_vectorized)

# Report the share of held-out rows that were predicted correctly
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

In [None]:
# Put the test-set predictions into a one-column table
df_predictions = pd.DataFrame(y_pred, columns=['Predictions'])

# Write the table to log_ngram_pred_path without the row index
df_predictions.to_csv(log_ngram_pred_path, index=False)

In [None]:
# tqdm

# Unused Code

# Text field preprocessing
def preprocess_text(text, stopword_set=None, word_lemmatizer=None):
    """Normalize one text value into a space-joined string of lemmatized tokens.

    Steps: lowercase, remove every character found in ``string.punctuation``,
    split on whitespace, discard stop words, lemmatize the remaining words.

    Args:
        text: The raw text. Any non-string value (for example the NaN pandas
            uses for missing text, or ``None``) is treated as empty text.
        stopword_set: Optional container of lowercase words to discard.
            Defaults to the notebook-level ``stop_words``.
        word_lemmatizer: Optional object exposing ``lemmatize(word)``.
            Defaults to the notebook-level ``lemmatizer``.

    Returns:
        str: The processed tokens joined by single spaces, or ``''`` when
        there is nothing to process.
    """
    # Missing text arrives as a float NaN when this function is applied to a
    # column with gaps; calling .lower() on it would raise AttributeError.
    if not isinstance(text, str):
        return ''

    # Fall back to the shared notebook-level helpers when none are supplied
    if stopword_set is None:
        stopword_set = stop_words
    if word_lemmatizer is None:
        word_lemmatizer = lemmatizer

    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenization, remove stop words, and lemmatization
    tokens = [
        word_lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stopword_set
    ]
    return ' '.join(tokens)

# Run every post text through preprocess_text and keep the result in a new column
sources_combined['post_text_processed'] = sources_combined['post_text'].map(preprocess_text)

!pip install pyspark
from pyspark.sql import SparkSession

### Variables
# Directory that holds the source CSVs, and the base names of the files in it
extract_path = '/home/gibsonce/datallah-jaymefis-gibsonce/'
csv_files = [
    'facebook_wiki_posts',
    'facebook_wiki_responses',
    'fitocracy_posts',
    'fitocracy_responses',
    'reddit_posts',
    'reddit_responses',
    'ted_responses',
    'facebook_congress_posts',
    'facebook_congress_responses',
    'annotations',
]

### Spark session
# Reuse an existing session if there is one, otherwise start one named 'example'
spark = SparkSession.builder.appName('example').getOrCreate()

### Spark Dataframes
# One Spark DataFrame per CSV: the first row is the header and column types are inferred.
# NOTE: these names are the same ones reset_dfs() uses for the pandas working frames.
facebook_wiki_posts = spark.read.csv(f'{extract_path}facebook_wiki_posts.csv', inferSchema=True, header=True)
facebook_wiki_responses = spark.read.csv(f'{extract_path}facebook_wiki_responses.csv', inferSchema=True, header=True)
fitocracy_posts = spark.read.csv(f'{extract_path}fitocracy_posts.csv', inferSchema=True, header=True)
fitocracy_responses = spark.read.csv(f'{extract_path}fitocracy_responses.csv', inferSchema=True, header=True)
reddit_posts = spark.read.csv(f'{extract_path}reddit_posts.csv', inferSchema=True, header=True)
reddit_responses = spark.read.csv(f'{extract_path}reddit_responses.csv', inferSchema=True, header=True)
ted_responses = spark.read.csv(f'{extract_path}ted_responses.csv', inferSchema=True, header=True)
annotations = spark.read.csv(f'{extract_path}annotations.csv', inferSchema=True, header=True)
facebook_congress_posts = spark.read.csv(f'{extract_path}facebook_congress_posts.csv', inferSchema=True, header=True)
facebook_congress_responses = spark.read.csv(f'{extract_path}facebook_congress_responses.csv', inferSchema=True, header=True)
    
