# Real Talk

Cleaning the data and running exploratory data analysis (EDA) to support an initial product analysis and to scope potential projects.

In [1]:
import os
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

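# used to grab data from local machine
# NOTE: `data_files` must be defined before the loading cell runs; it is indexed
# as data_files[0] (stories) and data_files[1] (reactions). The assignment below
# is commented out and has an empty pattern, so fill in the glob pattern and
# uncomment it before running the notebook from a fresh kernel.
# data_files = glob('')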

In [2]:
# loading in data
def _load_table(path, renames):
    """Read a CSV, snake_case its column names, then apply `renames`.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.
    renames : dict
        Maps normalized (lower-case, underscore-separated) column names to
        the names used in the rest of the notebook.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with normalized and renamed columns.
    """
    table = pd.read_csv(path)
    table.columns = [name.lower().replace(' ', '_') for name in table.columns]
    return table.rename(columns=renames)


# NOTE(review): index 1 is read as reactions and index 0 as stories, so this
# relies on the order of the paths in `data_files` -- confirm that ordering
# (or pick the files by name) before trusting the result.
reactions = _load_table(data_files[1], {'id': 'reaction_id', 'story': 'story_id'})
stories = _load_table(data_files[0], {'id': 'story_id'})

# NOTE(review): this preview shows every column of `stories`; check the
# rendered output for personal data before sharing the notebook.
stories.head()

Unnamed: 0,story_id,perspective,age,lgbtq,race,phone,topic,title,lede,published_at,...,cringey,haha,me_too,interesting,story_texts,link_url,link_photo_url,link_title,link_site_name,link_body
0,2269,female,19.0,no,white,870,relationships,,,,...,0,0,0,0,Well I was in my house and when I asked my eye...,,,,,
1,2268,female,18.0,no,white,3253073887,bullying,,I just need to work on my confidence,2020-01-06 00:00:00 UTC,...,1,1,1,1,My confident isnt the best. Its not with the w...,https://kidshealth.org/en/teens/self-esteem.html,,How can I improve my self-esteem?,Teens Health,
2,2267,female,18.0,yes,other,(304) 691-3740,puberty,,,,...,0,0,0,0,His name is Jeffery but we call him jeff💖; I a...,,,,,
3,2266,female,22.0,yes,white,0575809857,bullying,,,,...,0,0,0,0,The was Robert My boyfriend ; Nothing really. ...,,,,,
4,2265,female,15.0,yes,Latina,9793642515,puberty,,,,...,0,0,0,0,They were my ex and I still loved him.💘But the...,,,,,


In [3]:
# preview the first five rows of the reactions table
reactions.head(n=5)

Unnamed: 0,reaction_id,story_id,story_text,emoji,created_at
0,9730,2165.0,,❤️,2019-12-30 19:32:18 UTC
1,9729,2165.0,10786.0,😭,2019-12-30 17:25:38 UTC
2,9728,1848.0,,❤️,2019-12-30 06:26:23 UTC
3,9727,390.0,2009.0,❤️,2019-12-29 20:38:12 UTC
4,9726,2165.0,,👀,2019-12-29 20:33:34 UTC


## Data Cleaning
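- consolidating emoji variants that are stored as different characters but mean the same reaction (e.g. ♥️ vs ❤️, ✌🏽 vs ✌), which looks like a data integrity issue
- adding a boolean `is_published` flag that marks whether a story has a `published_at` timestamp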
- converting columns into datetimes where appropriate
- joining reactions table to stories table to get story statistics

In [4]:
# how often each distinct emoji value occurs
reactions['emoji'].value_counts()

✌🏽    3001
❤️    2845
👀     1597
😭     1157
😅      819
♥️       5
✌        3
Name: emoji, dtype: int64

In [5]:
# fold each emoji variant into the single symbol it should be counted as
emoji_aliases = {'♥️': '❤️', '✌🏽': '✌'}
reactions['emoji'] = reactions['emoji'].replace(emoji_aliases)

In [6]:
# boolean flag: True when the story has a publish timestamp
stories['is_published'] = ~stories['published_at'].isna()

In [7]:
# mapping to stories df to get additional dimensions
# Only the story columns needed downstream are carried over.
story_dimensions = stories[['story_id', 'published_at', 'is_published', 'topic']]

# Left join keeps every reaction, including those without a matching story.
# validate='many_to_one' makes pandas raise MergeError if `story_id` is
# duplicated in `stories`, instead of silently multiplying reaction rows and
# inflating every count computed from this table.
cleaned_reactions = pd.merge(
    reactions,
    story_dimensions,
    on='story_id',
    how='left',
    validate='many_to_one',
)

In [8]:
# parse the timestamp strings into datetimes
for timestamp_column in ('created_at', 'published_at'):
    cleaned_reactions[timestamp_column] = pd.to_datetime(cleaned_reactions[timestamp_column])

# whole days between the story's publish time and the reaction's creation time
time_since_published = cleaned_reactions['created_at'] - cleaned_reactions['published_at']
cleaned_reactions['days_since_published'] = time_since_published.dt.days

In [9]:
# sanity check: row count before and after removing reactions without a story_id
rows_before = cleaned_reactions.shape[0]
print(rows_before)
cleaned_reactions = cleaned_reactions.dropna(subset=['story_id'])
print(cleaned_reactions.shape[0])

9427
9413


In [10]:
# saving to CSV for tableau
# Create the output folder first: to_csv does not create missing directories,
# so the export would fail on a checkout that has no `data/` folder yet.
os.makedirs('data', exist_ok=True)

cleaned_reactions.to_csv('data/cleaned_reactions.csv', index=False)

# NOTE(review): `stories` is exported with all of its columns, including
# `phone` -- confirm that is acceptable for the Tableau extract.
# index=True writes the row index as an extra unnamed first column.
stories.to_csv('data/cleaned_stories.csv', index=True)