# Import Packages

In [18]:
import numpy as np
import pandas as pd

# Read in Previously Collected Data

In [19]:
onion = pd.read_csv('../data/theonion_1682378516.csv')
wldnws = pd.read_csv('../data/worldnews_1682378600.csv')
print(onion.shape), print(wldnws.shape)

(5996, 100)
(5990, 94)


(None, None)

In [20]:
# Select only the columns that would be needed going forward.
#  Some of these may be dropped later but may make it easier to clean the data for now
onion = onion[['subreddit', 'selftext', 'title', 'subreddit_name_prefixed', 'whitelist_status', 'created_utc']]
wldnws = wldnws[['subreddit', 'selftext', 'title', 'subreddit_name_prefixed', 'whitelist_status', 'created_utc']]
onion.shape, wldnws.shape

((5996, 6), (5990, 6))

## Check for Duplicates

In [21]:
onion.drop_duplicates().shape, wldnws.drop_duplicates().shape

((5995, 6), (5990, 6))

Both dataframes have a comparable number of samples to start off with

In [22]:
onion['title'].shape, onion['title'].drop_duplicates().shape

((5996,), (5478,))

In [23]:
wldnws['title'].shape, wldnws['title'].drop_duplicates().shape

((5990,), (5413,))

Looking at the 'title' however, it can be seen that there are duplicate titles.  It may be intersting to investigate these further.

In [24]:
# Filter only values where the title is duplicated.
#  Set 'keep' = False to keep all instances so we can see how they are duplicated
#   Source:  https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.duplicated.html
onion[onion['title'].duplicated(keep = False) == True].sort_values('title').head(20)

Unnamed: 0,subreddit,selftext,title,subreddit_name_prefixed,whitelist_status,created_utc
3147,TheOnion,,109 Details About ‘Grand Theft Auto VI’ We’re ...,r/TheOnion,all_ads,1625354899
3155,TheOnion,,109 Details About ‘Grand Theft Auto VI’ We’re ...,r/TheOnion,all_ads,1625157320
3277,TheOnion,,5 Things To Know About ‘Friends: The Reunion’,,all_ads,1621956697
3262,TheOnion,,5 Things To Know About ‘Friends: The Reunion’,,all_ads,1622042399
3774,TheOnion,,A Quick And Simple Drywall Recipe That Kids Wi...,,all_ads,1612126697
2742,TheOnion,,A Quick And Simple Drywall Recipe That Kids Wi...,r/TheOnion,all_ads,1635622335
5720,TheOnion,,A Shocking Scene of Rebellion | The Onion Pres...,r/TheOnion,all_ads,1581115634
5718,TheOnion,,A Shocking Scene of Rebellion | The Onion Pres...,r/TheOnion,all_ads,1581117436
4013,TheOnion,[deleted],A shocking new study that asked teen boys abou...,,all_ads,1606664921
4012,TheOnion,,A shocking new study that asked teen boys abou...,,all_ads,1606664967


In [25]:
# Filter only values where the title is duplicated.
#  Set 'keep' = False to keep all instances so we can see how they are duplicated
#   Source:  https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.duplicated.html
wldnws[wldnws['title'].duplicated(keep = False) == True].sort_values('title').head(20)

Unnamed: 0,subreddit,selftext,title,subreddit_name_prefixed,whitelist_status,created_utc
396,worldnews,,"""Caminando en mis zapatos, el mundo se ve dife...",r/worldnews,all_ads,1682264788
397,worldnews,,"""Caminando en mis zapatos, el mundo se ve dife...",r/worldnews,all_ads,1682264753
5480,worldnews,,"'The richest man and woman in the world are, f...",r/worldnews,all_ads,1680806884
5879,worldnews,,"'The richest man and woman in the world are, f...",r/worldnews,all_ads,1680709934
3366,worldnews,,"18,000 cows killed in explosion, fire at Texas...",r/worldnews,all_ads,1681403121
3563,worldnews,,"18,000 cows killed in explosion, fire at Texas...",r/worldnews,all_ads,1681353067
5312,worldnews,,3 in 4 Russian Manufacturers Hit by Western Sa...,r/worldnews,all_ads,1680857457
5300,worldnews,,3 in 4 Russian Manufacturers Hit by Western Sa...,r/worldnews,all_ads,1680861097
613,worldnews,,70-year-old Russian woman who called President...,r/worldnews,all_ads,1682188221
665,worldnews,,70-year-old Russian woman who called President...,r/worldnews,all_ads,1682169612


### Observations:
It appears that for both dataframes the duplicates are resubmissions of the original submission.  Perhaps there were errors that were corrected, or users posting items more than once.

Either way, there are still over 5400 submissions for both The Onion and World News which should be enough.  As this analysis will only examine the titles, no further investigations into duplicates will need to be performed and they can be dropped.

Before doing so however, it would be good to very null counts in each dataframe.

In [26]:
onion.isnull().sum(), wldnws.isnull().sum()

(subreddit                     0
 selftext                   5483
 title                         0
 subreddit_name_prefixed    1994
 whitelist_status              0
 created_utc                   0
 dtype: int64,
 subreddit                     0
 selftext                   5768
 title                         0
 subreddit_name_prefixed       0
 whitelist_status              0
 created_utc                   0
 dtype: int64)

### Observations:
As only the titles are being examined and there are no nulls in either column, null values do not need to be dropped.  The analysis will continue by dropping duplicates.

In [27]:
# Drop the duplicates for each dataset based on the title columns.
#  Source to find 'subset' argument:  Doc String
onion.drop_duplicates(subset = 'title', inplace=True)
wldnws.drop_duplicates(subset = 'title', inplace=True)
onion.shape, wldnws.shape

((5478, 6), (5413, 6))

In [28]:
# Find the sum of duplicated timestamps in the 'created_utc' column to determine if there are any duplicates missed by the previous steps.
onion['created_utc'].duplicated().sum()

52

In [30]:
# Filter the dataframe to show only timestamp duplicates so it can be determined if the titles look similar in any way.
#  Again, the keep is set to false to see all duplciated entries.
onion[onion['created_utc'].duplicated(keep = False) == True].sort_values('created_utc')

Unnamed: 0,subreddit,selftext,title,subreddit_name_prefixed,whitelist_status,created_utc
5019,TheOnion,,6 Ways To Make Your Neighbor Move Away Using N...,,all_ads,1590678208
5018,TheOnion,,‘Invest In My Blood Idea Before I Jab You’: An...,,all_ads,1590678208
5012,TheOnion,,White House Press Secretary: ‘Trump’s Critics ...,,all_ads,1590693023
5013,TheOnion,,Protestors Criticized For Looting Businesses W...,,all_ads,1590693023
5002,TheOnion,,Woman Needs To Shut Up And Allow Man To Be Ally,,all_ads,1590757811
...,...,...,...,...,...,...
110,TheOnion,,Report Reveals Only Jeff Bezos Finished Watchi...,r/TheOnion,all_ads,1680871722
36,TheOnion,,Tee Ball Coach Reminds Players To Use Both Han...,r/TheOnion,all_ads,1681898656
35,TheOnion,,Fan Respects Women Too Much To See Their Bodie...,r/TheOnion,all_ads,1681898656
4,TheOnion,,Dog And Owner Having Public Fight,r/TheOnion,all_ads,1682337233


### Observations:
A secondary check of the sumission creation times, 'created_utc' shows that there are some submissions that were posted at the same epoch time.  By sorting on the time it is clear that these are not duplicate title entries.

Thus, these duplciates can be ignored and do not need to be processed.

Let's examine the datasets again to eventually drop unneeded columns

In [12]:
onion.info(), onion.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5478 entries, 0 to 5995
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   subreddit                5478 non-null   object
 1   selftext                 354 non-null    object
 2   title                    5478 non-null   object
 3   subreddit_name_prefixed  3685 non-null   object
 4   whitelist_status         5478 non-null   object
 5   created_utc              5478 non-null   int64 
dtypes: int64(1), object(5)
memory usage: 299.6+ KB


(None,
   subreddit selftext                                              title  \
 0  TheOnion      NaN  Idiot Tornado Tears Harmlessly Through Empty F...   
 1  TheOnion      NaN  New Texas Law Requires Schools To Display Imag...   
 2  TheOnion      NaN  New Poll Finds Americans Would Respect Biden M...   
 3  TheOnion      NaN  Could You Pass Racial Discrimination Training ...   
 4  TheOnion      NaN                  Dog And Owner Having Public Fight   
 
   subreddit_name_prefixed whitelist_status  created_utc  
 0              r/TheOnion          all_ads   1682364316  
 1              r/TheOnion          all_ads   1682363827  
 2              r/TheOnion          all_ads   1682361260  
 3              r/TheOnion          all_ads   1682337233  
 4              r/TheOnion          all_ads   1682337233  )

The columns 'subreddit_name_prefixed', 'whitelist_status', and 'created_utc' are no longer needed, but the selftext columns should be examined to determine if they contain any information such as an overflow of the title.

In [35]:
# Filter the dataframe to view only samples where the selftext is not null
onion[onion['selftext'].isna() == False]

Unnamed: 0,subreddit,selftext,title,subreddit_name_prefixed,whitelist_status,created_utc
87,TheOnion,[removed],Not the onion this time,r/TheOnion,all_ads,1681269849
126,TheOnion,[removed],Who has the video,r/TheOnion,all_ads,1680725923
215,TheOnion,[removed],Illinois Bakery Named Best Bakery In The Entir...,r/TheOnion,all_ads,1679661072
722,TheOnion,"Published July 7, 2004",Nation’s Liberals Suffering From Outrage Fatigue,r/TheOnion,all_ads,1671488700
808,TheOnion,[removed],The Dunning Kruger Effect,r/TheOnion,all_ads,1670122697
...,...,...,...,...,...,...
5916,TheOnion,[deleted],Swing Voter Really Relates To Buttigieg’s Comp...,r/TheOnion,all_ads,1579025700
5930,TheOnion,[deleted],Saudi authorities close down shop selling trad...,r/TheOnion,all_ads,1578785091
5943,TheOnion,[deleted],"An oldie, but a goodie",r/TheOnion,all_ads,1578599249
5962,TheOnion,[deleted],"Girl, 9, disappears using cream that makes you...",r/TheOnion,all_ads,1578369505


In [36]:
# Many entries are removed or deleted, but value counts can help find unique entries quickly
onion[onion['selftext'].isna() == False]['selftext'].value_counts()

[deleted]                 344
[removed]                   9
Published July 7, 2004      1
Name: selftext, dtype: int64

### Observations:
There are only three values populated into the selftext and none of them are useful.

This will be repeated on the World News data, and if the results are similar, the 'selftext' column in both dataframes will be dropped.

In [37]:
wldnws.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5413 entries, 0 to 5988
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   subreddit                5413 non-null   object
 1   selftext                 198 non-null    object
 2   title                    5413 non-null   object
 3   subreddit_name_prefixed  5413 non-null   object
 4   whitelist_status         5413 non-null   object
 5   created_utc              5413 non-null   int64 
dtypes: int64(1), object(5)
memory usage: 296.0+ KB


In [39]:
# Filter the dataframe to view only samples where the selftext is not null
wldnws[wldnws['selftext'].isna() == False]

Unnamed: 0,subreddit,selftext,title,subreddit_name_prefixed,whitelist_status,created_utc
7,worldnews,[deleted],NL uses potentially biased algorithm for visa ...,r/worldnews,all_ads,1682376013
21,worldnews,[deleted],How Chinese censorship is quietly rewriting Co...,r/worldnews,all_ads,1682371953
52,worldnews,[removed],Russian State Outlet RT Reaches Out to Tucker ...,r/worldnews,all_ads,1682363126
78,worldnews,[removed],Brics draws membership requests from 19 nation...,r/worldnews,all_ads,1682356537
86,worldnews,[removed],Tucker Carlson Is Gone From Fox News,r/worldnews,all_ads,1682354338
...,...,...,...,...,...,...
5741,worldnews,[removed],Israeli Forces Draw Condemnation Over 'Barbari...,r/worldnews,all_ads,1680742365
5783,worldnews,[removed],"Taiwan leader, US Speaker McCarthy meet in Cal...",r/worldnews,all_ads,1680727519
5921,worldnews,[deleted],"Credit Suisse ‘could not be saved,’ chairman s...",r/worldnews,all_ads,1680702660
5935,worldnews,[removed],"In Poland, the Catholic church backed abortion...",r/worldnews,all_ads,1680699508


In [40]:
# Many entries are removed or deleted, but value counts can help find unique entries quickly
wldnws[wldnws['selftext'].isna() == False]['selftext'].value_counts()

[removed]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               

In [45]:
# These posts can be investigated to make sure that the titles provide useful information without the selftext
#  Creat a filter like previously, but with additional conditions to exclude cells where the selftext is 'removed' or 'deleted'

wldnws[(wldnws['selftext'].isna() == False) & (wldnws['selftext'] != '[removed]') & (wldnws['selftext'] != '[deleted]')]

Unnamed: 0,subreddit,selftext,title,subreddit_name_prefixed,whitelist_status,created_utc
666,worldnews,Twitter’s new rules limit the number of automa...,Disaster alert accounts are preparing for a wo...,r/worldnews,all_ads,1682169477
776,worldnews,This request was previously blocked by the US....,Israel holding advanced talks with Germany on ...,r/worldnews,all_ads,1682121969
804,worldnews,Air Canada's cargo operations were handling th...,What genius trusted Air Canada with a $20 Mill...,r/worldnews,all_ads,1682110918
828,worldnews,"Kupiansk downtown looks deserted: shops, pharm...",Constant Missile Attacks and Empty Streets: Ho...,r/worldnews,all_ads,1682100806
861,worldnews,"In 2021, a small boat carrying the bodies of m...","We are Renata Brito and Felipe Dana, journalis...",r/worldnews,all_ads,1682093522
1111,worldnews,Five Army soldiers were killed Thursday aftern...,"Five soldiers killed in J&amp;K terror attack,...",r/worldnews,all_ads,1682020248
1403,worldnews,Lula says that people with mental disorders ha...,Lula (Brazilian president) says that people wi...,r/worldnews,all_ads,1681948544
1503,worldnews,PROOF: https://i.redd.it/hzvrwow9qqua1.jpg\n\n...,I'm a tech journalist who's been covering how ...,r/worldnews,all_ads,1681919689


In [48]:
wldnws[(wldnws['selftext'].isna() == False) & (wldnws['selftext'] != '[removed]') & (wldnws['selftext'] != '[deleted]')].index

Int64Index([666, 776, 804, 828, 861, 1111, 1403, 1503], dtype='int64')

In [49]:
wldnws['title'][0]

'The parents of a 10-year-old boy living with autism, ADHD and anxiety are "sickened" after their son was mocked in a video created by a teacher, which was shown to staff and posted online'

In [66]:
# To read this a bit better, let's print of the title and subtext for each, and format accordingly with f-strings to make it easier to read
#  In the first line, the code is simply grabbing the index of each value where the selftext is populated with something readable.
#  The loop then iterates through each index and prints the title and subtect for that index value using it to call from the column series.
for i in wldnws[(wldnws['selftext'].isna() == False) & (wldnws['selftext'] != '[removed]') & (wldnws['selftext'] != '[deleted]')].index:
    print(f"Title:  {wldnws['title'][i]} \nSelftext:  {wldnws['selftext'][i]} \n {'-'*200} \n")

Title:  Disaster alert accounts are preparing for a world after Twitter 
Selftext:  Twitter’s new rules limit the number of automated tweets an account can send out without paying, making it a less reliable place for emergency alerts 
 -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 

Title:  Israel holding advanced talks with Germany on sale of Arrow 3 
Selftext:  This request was previously blocked by the US. Looks like its firmly back on the table. This is a joint US/Israeli system and is extremely capable. One of the very few ABM systems to intercept multiple targets simultaneously in testing. One of them was outside the atmosphere during intercept. 
 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Observations:
* It can be seen that the titles are relatively complete and that the self text is some sort of short synopsis of the story.  These do not need to be included.
* It can also be seen that some characters like ampersands are not being processed well.  These should be converted if possible.

***Thus, for both the Onion and World News dataframes, the 'selftext' columns can be dropped.***