### Data Cleaning

- Combining SpaceX and NASA
- Dropping unneeded columns and duplicates
- Checking for nulls
- Saving csv

In [1]:
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Render inline figures at high-DPI ("retina") resolution and embed them
# directly in the notebook output.
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# Apply the built-in 'fivethirtyeight' matplotlib style to every figure.
plt.style.use('fivethirtyeight')

### Exploratory Data Analysis

#### Load Data

In [2]:
# Read the two source CSVs (NASA and SpaceX) into separate frames.
nasa_df = pd.read_csv('../data/nasa_df.csv')
spacex_df = pd.read_csv('../data/spacex_df.csv')

In [4]:
# Stack the SpaceX and NASA frames into a single frame.
# - sort=True sorts the column axis when the inputs' columns are not aligned.
# - ignore_index=True gives the result a fresh 0..n-1 index, so row labels
#   coming from the two inputs do not collide in the combined frame.
df = pd.concat([spacex_df, nasa_df], sort=True, ignore_index=True)

In [5]:
# Display the column labels of the combined frame.
df.columns

Index(['Unnamed: 0', 'approved_at_utc', 'approved_by', 'archived', 'author',
       'author_cakeday', 'author_flair_background_color',
       'author_flair_css_class', 'author_flair_richtext',
       'author_flair_template_id', 'author_flair_text',
       'author_flair_text_color', 'author_flair_type', 'author_fullname',
       'banned_at_utc', 'banned_by', 'can_gild', 'can_mod_post', 'category',
       'clicked', 'content_categories', 'contest_mode', 'created',
       'created_utc', 'crosspost_parent', 'crosspost_parent_list',
       'distinguished', 'domain', 'downs', 'edited', 'gilded', 'hidden',
       'hide_score', 'id', 'is_crosspostable', 'is_meta',
       'is_original_content', 'is_reddit_media_domain', 'is_self', 'is_video',
       'likes', 'link_flair_background_color', 'link_flair_css_class',
       'link_flair_richtext', 'link_flair_template_id', 'link_flair_text',
       'link_flair_text_color', 'link_flair_type', 'locked', 'media',
       'media_embed', 'media_metadata', 

In [6]:
# Remove the 'Unnamed: 0' column (presumably an index that was saved into the
# source CSVs -- confirm against the notebook that wrote them).
# Rebinding `df` with errors='ignore' keeps this cell safe to re-run: a second
# execution is a no-op instead of raising KeyError on the missing label.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [14]:
# Discard rows that are identical across every column, keeping the first
# occurrence. The result is rebound to `df` rather than mutating in place, so
# the frame displayed by earlier cells is not silently changed underneath them.
df = df.drop_duplicates()

In [15]:
# Preview the first five rows of the de-duplicated frame.
df.head()

Unnamed: 0,approved_at_utc,approved_by,archived,author,author_cakeday,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,...,thumbnail_height,thumbnail_width,title,ups,url,user_reports,view_count,visited,whitelist_status,wls
0,,,False,ElongatedMuskrat,,,contributor,[],,r/SpaceX Bot,...,,,Telstar 18V / APStar 5C Launch Campaign Thread,242,https://www.reddit.com/r/spacex/comments/95cte...,[],,False,all_ads,6
1,,,False,ElongatedMuskrat,,,contributor,[],,r/SpaceX Bot,...,,,"r/SpaceX Discusses [September 2018, #48]",166,https://www.reddit.com/r/spacex/comments/9ckoe...,[],,False,all_ads,6
2,,,False,MingerOne,,,,[],,,...,140.0,140.0,Static fire test of Falcon 9 complete—targetin...,543,https://twitter.com/SpaceX/status/103734504243...,[],,False,all_ads,6
3,,,False,MingerOne,,,,[],,,...,84.0,140.0,SpaceX’s second dedicated USAF mission targets...,510,https://www.teslarati.com/spacex-second-usaf-m...,[],,False,all_ads,6
4,,,False,jclishman,,,contributor,[],,Host of Inmarsat-5 Flight 4,...,105.0,140.0,SpaceX Monthly Recap | August 2018 | Crew arm ...,442,https://youtu.be/_S5zUOq5zSc,[],,False,all_ads,6


In [16]:
# Count missing `selftext` values once, then report the raw count followed by
# its share of all rows.
selftext_null_count = df['selftext'].isnull().sum()
print(selftext_null_count)
print(selftext_null_count / len(df))

2047
0.8441237113402061


In [17]:
# Number of rows with a missing `title`.
df['title'].isna().sum()

0

In [18]:
# (rows, columns) of the cleaned frame.
df.shape

(2425, 98)

In [19]:
# Row count per subreddit, to see how the rows split between the two sources.
df['subreddit'].value_counts()

spacex    1248
nasa      1177
Name: subreddit, dtype: int64

#### Saved Files

In [20]:
# Persist the cleaned, combined frame to CSV.
# NOTE(review): to_csv writes the row index by default, so the file gets an
# unnamed leading column that reads back as 'Unnamed: 0'. Consider
# index=False once downstream readers are confirmed not to rely on it.
df.to_csv('../data/df_final.csv')