In [1]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [2]:
### Loading in dataframes

In [3]:
iphone = pd.read_csv('iphone.csv')

In [4]:
android = pd.read_csv('android.csv')

In [5]:
iphone.head()

Unnamed: 0.1,Unnamed: 0,subreddit,selftext,title
0,0,iphone,[removed],What's App Slow on iPhone compared to Android
1,1,iphone,,Iphone X expansive offer (Email Submit).
2,2,iphone,,"Got my first job, saved up for over a year, an..."
3,3,iphone,,Get S&amp; P iPhone XS
4,4,iphone,[removed],Iphone 12 mini standby issues


In [6]:
android.head()

Unnamed: 0.1,Unnamed: 0,subreddit,selftext,title
0,0,Android,,OnePlus broke Google Play on 7T Pro with lates...
1,1,Android,[removed],Blinking battery arrow
2,2,Android,[removed],I need help pls
3,3,Android,[removed],M31 Android 11
4,4,Android,[removed],Where are wallpapers saved to?


In [7]:
### Combining datasets into one

In [8]:
df = pd.concat([iphone, android])

In [9]:
df.head()

Unnamed: 0.1,Unnamed: 0,subreddit,selftext,title
0,0,iphone,[removed],What's App Slow on iPhone compared to Android
1,1,iphone,,Iphone X expansive offer (Email Submit).
2,2,iphone,,"Got my first job, saved up for over a year, an..."
3,3,iphone,,Get S&amp; P iPhone XS
4,4,iphone,[removed],Iphone 12 mini standby issues


In [10]:
### Dropping unnecessary columns

In [11]:
df = df.drop(columns='Unnamed: 0')

In [12]:
df.head()

Unnamed: 0,subreddit,selftext,title
0,iphone,[removed],What's App Slow on iPhone compared to Android
1,iphone,,Iphone X expansive offer (Email Submit).
2,iphone,,"Got my first job, saved up for over a year, an..."
3,iphone,,Get S&amp; P iPhone XS
4,iphone,[removed],Iphone 12 mini standby issues


In [13]:
### Analyzing the selftext column, and setting values of null, '[removed]' and '[deleted]' to ''.

In [14]:
df['selftext'].value_counts()

[removed]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               

In [15]:
df.isnull().sum()

subreddit       0
selftext     6339
title           0
dtype: int64

In [16]:
### Filling null text values

In [17]:
df = df.fillna('')

In [18]:
df.isnull().sum()

subreddit    0
selftext     0
title        0
dtype: int64

In [19]:
### replacing non-text entries with an empty string to then combine the 'selftext' and 'title' columns 

In [20]:
 text_list = []
for value in df['selftext']:
    if value == '[removed]':
        text_list.append('')
    elif value == '[deleted]':
        text_list.append('')
    elif value == '':
        text_list.append('')
    else:
        text_list.append(value)
        
df['selftext'] = text_list

In [21]:
df['text'] = df['title'] + df['selftext'] 

In [22]:
df

Unnamed: 0,subreddit,selftext,title,text
0,iphone,,What's App Slow on iPhone compared to Android,What's App Slow on iPhone compared to Android
1,iphone,,Iphone X expansive offer (Email Submit).,Iphone X expansive offer (Email Submit).
2,iphone,,"Got my first job, saved up for over a year, an...","Got my first job, saved up for over a year, an..."
3,iphone,,Get S&amp; P iPhone XS,Get S&amp; P iPhone XS
4,iphone,,Iphone 12 mini standby issues,Iphone 12 mini standby issues
...,...,...,...,...
9995,Android,,[SELF] Recomendo - The app that gives tv shows...,[SELF] Recomendo - The app that gives tv shows...
9996,Android,,Ticwatch Pro 3 Review: Wear OS Finally Works!,Ticwatch Pro 3 Review: Wear OS Finally Works!
9997,Android,,[Mr. Mobile] Ticwatch Pro 3 Review: Wear OS Fi...,[Mr. Mobile] Ticwatch Pro 3 Review: Wear OS Fi...
9998,Android,,Ticwatch Pro 3 Review: Wear OS Finally Works!,Ticwatch Pro 3 Review: Wear OS Finally Works!


In [23]:
df['selftext'].value_counts()

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [24]:
df = df.drop(columns=['selftext', 'title'])

In [25]:
### Creating new binary classification column for iphone vs android and dropping old 'subreddit' column.

In [26]:
sub_list = []
for value in df['subreddit']:
    if value == 'iphone':
        sub_list.append(1)
    else:
        sub_list.append(0)
        
df['iphone_subreddit'] = sub_list

In [27]:
df = df.drop(columns='subreddit')

In [28]:
df['text'][0]

0        What's App Slow on iPhone compared to Android
0    OnePlus broke Google Play on 7T Pro with lates...
Name: text, dtype: object

In [29]:
### resetting dataframe index

In [30]:
df.index = range(20000)

In [31]:
df

Unnamed: 0,text,iphone_subreddit
0,What's App Slow on iPhone compared to Android,1
1,Iphone X expansive offer (Email Submit).,1
2,"Got my first job, saved up for over a year, an...",1
3,Get S&amp; P iPhone XS,1
4,Iphone 12 mini standby issues,1
...,...,...
19995,[SELF] Recomendo - The app that gives tv shows...,0
19996,Ticwatch Pro 3 Review: Wear OS Finally Works!,0
19997,[Mr. Mobile] Ticwatch Pro 3 Review: Wear OS Fi...,0
19998,Ticwatch Pro 3 Review: Wear OS Finally Works!,0


In [32]:
### Exporting dataframe for modeling/sentiment analysis

In [33]:
df.to_csv('cleaned_df.csv')