# Import Packages

In [1]:
import numpy as np
import pandas as pd
import re

# Read in Previously Collected Data

In [2]:
onion = pd.read_csv('../data/theonion_1682378516.csv')
wldnws = pd.read_csv('../data/worldnews_1682378600.csv')
print(onion.shape), print(wldnws.shape)

(5996, 100)
(5990, 94)


(None, None)

In [3]:
# Select only the columns that would be needed going forward.
#  Some of these may be dropped later but may make it easier to clean the data for now
onion = onion[['subreddit', 'selftext', 'title', 'subreddit_name_prefixed', 'whitelist_status', 'created_utc']]
wldnws = wldnws[['subreddit', 'selftext', 'title', 'subreddit_name_prefixed', 'whitelist_status', 'created_utc']]
onion.shape, wldnws.shape

((5996, 6), (5990, 6))

## Check for Duplicates

In [4]:
onion.drop_duplicates().shape, wldnws.drop_duplicates().shape

((5995, 6), (5990, 6))

Both dataframes have a comparable number of samples to start off with

In [5]:
onion['title'].shape, onion['title'].drop_duplicates().shape

((5996,), (5478,))

In [6]:
wldnws['title'].shape, wldnws['title'].drop_duplicates().shape

((5990,), (5413,))

Looking at the 'title' however, it can be seen that there are duplicate titles.  It may be intersting to investigate these further.

In [7]:
# Filter only values where the title is duplicated.
#  Set 'keep' = False to keep all instances so we can see how they are duplicated
#   Source:  https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.duplicated.html
onion[onion['title'].duplicated(keep = False) == True].sort_values('title').head(20)

Unnamed: 0,subreddit,selftext,title,subreddit_name_prefixed,whitelist_status,created_utc
3147,TheOnion,,109 Details About ‘Grand Theft Auto VI’ We’re ...,r/TheOnion,all_ads,1625354899
3155,TheOnion,,109 Details About ‘Grand Theft Auto VI’ We’re ...,r/TheOnion,all_ads,1625157320
3277,TheOnion,,5 Things To Know About ‘Friends: The Reunion’,,all_ads,1621956697
3262,TheOnion,,5 Things To Know About ‘Friends: The Reunion’,,all_ads,1622042399
3774,TheOnion,,A Quick And Simple Drywall Recipe That Kids Wi...,,all_ads,1612126697
2742,TheOnion,,A Quick And Simple Drywall Recipe That Kids Wi...,r/TheOnion,all_ads,1635622335
5720,TheOnion,,A Shocking Scene of Rebellion | The Onion Pres...,r/TheOnion,all_ads,1581115634
5718,TheOnion,,A Shocking Scene of Rebellion | The Onion Pres...,r/TheOnion,all_ads,1581117436
4013,TheOnion,[deleted],A shocking new study that asked teen boys abou...,,all_ads,1606664921
4012,TheOnion,,A shocking new study that asked teen boys abou...,,all_ads,1606664967


In [8]:
# Filter only values where the title is duplicated.
#  Set 'keep' = False to keep all instances so we can see how they are duplicated
#   Source:  https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.duplicated.html
wldnws[wldnws['title'].duplicated(keep = False) == True].sort_values('title').head(20)

Unnamed: 0,subreddit,selftext,title,subreddit_name_prefixed,whitelist_status,created_utc
396,worldnews,,"""Caminando en mis zapatos, el mundo se ve dife...",r/worldnews,all_ads,1682264788
397,worldnews,,"""Caminando en mis zapatos, el mundo se ve dife...",r/worldnews,all_ads,1682264753
5480,worldnews,,"'The richest man and woman in the world are, f...",r/worldnews,all_ads,1680806884
5879,worldnews,,"'The richest man and woman in the world are, f...",r/worldnews,all_ads,1680709934
3366,worldnews,,"18,000 cows killed in explosion, fire at Texas...",r/worldnews,all_ads,1681403121
3563,worldnews,,"18,000 cows killed in explosion, fire at Texas...",r/worldnews,all_ads,1681353067
5312,worldnews,,3 in 4 Russian Manufacturers Hit by Western Sa...,r/worldnews,all_ads,1680857457
5300,worldnews,,3 in 4 Russian Manufacturers Hit by Western Sa...,r/worldnews,all_ads,1680861097
613,worldnews,,70-year-old Russian woman who called President...,r/worldnews,all_ads,1682188221
665,worldnews,,70-year-old Russian woman who called President...,r/worldnews,all_ads,1682169612


### Observations:
It appears that for both dataframes the duplicates are resubmissions of the original submission.  Perhaps there were errors that were corrected, or users posting items more than once.

Either way, there are still over 5400 submissions for both The Onion and World News which should be enough.  As this analysis will only examine the titles, no further investigations into duplicates will need to be performed and they can be dropped.

Before doing so however, it would be good to very null counts in each dataframe.

In [9]:
onion.isnull().sum(), wldnws.isnull().sum()

(subreddit                     0
 selftext                   5483
 title                         0
 subreddit_name_prefixed    1994
 whitelist_status              0
 created_utc                   0
 dtype: int64,
 subreddit                     0
 selftext                   5768
 title                         0
 subreddit_name_prefixed       0
 whitelist_status              0
 created_utc                   0
 dtype: int64)

### Observations:
As only the titles are being examined and there are no nulls in either column, null values do not need to be dropped.  The analysis will continue by dropping duplicates.

In [10]:
# Drop the duplicates for each dataset based on the title columns.
#  Source to find 'subset' argument:  Doc String
onion.drop_duplicates(subset = 'title', inplace=True)
wldnws.drop_duplicates(subset = 'title', inplace=True)
onion.shape, wldnws.shape

((5478, 6), (5413, 6))

In [11]:
# Find the sum of duplicated timestamps in the 'created_utc' column to determine if there are any duplicates missed by the previous steps.
onion['created_utc'].duplicated().sum()

52

In [12]:
# Filter the dataframe to show only timestamp duplicates so it can be determined if the titles look similar in any way.
#  Again, the keep is set to false to see all duplciated entries.
onion[onion['created_utc'].duplicated(keep = False) == True].sort_values('created_utc')

Unnamed: 0,subreddit,selftext,title,subreddit_name_prefixed,whitelist_status,created_utc
5019,TheOnion,,6 Ways To Make Your Neighbor Move Away Using N...,,all_ads,1590678208
5018,TheOnion,,‘Invest In My Blood Idea Before I Jab You’: An...,,all_ads,1590678208
5012,TheOnion,,White House Press Secretary: ‘Trump’s Critics ...,,all_ads,1590693023
5013,TheOnion,,Protestors Criticized For Looting Businesses W...,,all_ads,1590693023
5002,TheOnion,,Woman Needs To Shut Up And Allow Man To Be Ally,,all_ads,1590757811
...,...,...,...,...,...,...
110,TheOnion,,Report Reveals Only Jeff Bezos Finished Watchi...,r/TheOnion,all_ads,1680871722
36,TheOnion,,Tee Ball Coach Reminds Players To Use Both Han...,r/TheOnion,all_ads,1681898656
35,TheOnion,,Fan Respects Women Too Much To See Their Bodie...,r/TheOnion,all_ads,1681898656
4,TheOnion,,Dog And Owner Having Public Fight,r/TheOnion,all_ads,1682337233


### Observations:
A secondary check of the sumission creation times, 'created_utc' shows that there are some submissions that were posted at the same epoch time.  By sorting on the time it is clear that these are not duplicate title entries.

Thus, these duplciates can be ignored and do not need to be processed.

Let's examine the datasets again to eventually drop unneeded columns

In [13]:
onion.info(), onion.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5478 entries, 0 to 5995
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   subreddit                5478 non-null   object
 1   selftext                 354 non-null    object
 2   title                    5478 non-null   object
 3   subreddit_name_prefixed  3685 non-null   object
 4   whitelist_status         5478 non-null   object
 5   created_utc              5478 non-null   int64 
dtypes: int64(1), object(5)
memory usage: 299.6+ KB


(None,
   subreddit selftext                                              title  \
 0  TheOnion      NaN  Idiot Tornado Tears Harmlessly Through Empty F...   
 1  TheOnion      NaN  New Texas Law Requires Schools To Display Imag...   
 2  TheOnion      NaN  New Poll Finds Americans Would Respect Biden M...   
 3  TheOnion      NaN  Could You Pass Racial Discrimination Training ...   
 4  TheOnion      NaN                  Dog And Owner Having Public Fight   
 
   subreddit_name_prefixed whitelist_status  created_utc  
 0              r/TheOnion          all_ads   1682364316  
 1              r/TheOnion          all_ads   1682363827  
 2              r/TheOnion          all_ads   1682361260  
 3              r/TheOnion          all_ads   1682337233  
 4              r/TheOnion          all_ads   1682337233  )

# Check Selftext Values
Determine if the selftext of any post is a continuation of the title, or contains relevant information

The columns 'subreddit_name_prefixed', 'whitelist_status', and 'created_utc' are no longer needed, but the selftext columns should be examined to determine if they contain any information such as an overflow of the title.

In [14]:
# Filter the dataframe to view only samples where the selftext is not null
onion[onion['selftext'].isna() == False]

Unnamed: 0,subreddit,selftext,title,subreddit_name_prefixed,whitelist_status,created_utc
87,TheOnion,[removed],Not the onion this time,r/TheOnion,all_ads,1681269849
126,TheOnion,[removed],Who has the video,r/TheOnion,all_ads,1680725923
215,TheOnion,[removed],Illinois Bakery Named Best Bakery In The Entir...,r/TheOnion,all_ads,1679661072
722,TheOnion,"Published July 7, 2004",Nation’s Liberals Suffering From Outrage Fatigue,r/TheOnion,all_ads,1671488700
808,TheOnion,[removed],The Dunning Kruger Effect,r/TheOnion,all_ads,1670122697
...,...,...,...,...,...,...
5916,TheOnion,[deleted],Swing Voter Really Relates To Buttigieg’s Comp...,r/TheOnion,all_ads,1579025700
5930,TheOnion,[deleted],Saudi authorities close down shop selling trad...,r/TheOnion,all_ads,1578785091
5943,TheOnion,[deleted],"An oldie, but a goodie",r/TheOnion,all_ads,1578599249
5962,TheOnion,[deleted],"Girl, 9, disappears using cream that makes you...",r/TheOnion,all_ads,1578369505


In [15]:
# Many entries are removed or deleted, but value counts can help find unique entries quickly
onion[onion['selftext'].isna() == False]['selftext'].value_counts()

[deleted]                 344
[removed]                   9
Published July 7, 2004      1
Name: selftext, dtype: int64

### Observations:
There are only three values populated into the selftext and none of them are useful.

This will be repeated on the World News data, and if the results are similar, the 'selftext' column in both dataframes will be dropped.

In [16]:
wldnws.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5413 entries, 0 to 5988
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   subreddit                5413 non-null   object
 1   selftext                 198 non-null    object
 2   title                    5413 non-null   object
 3   subreddit_name_prefixed  5413 non-null   object
 4   whitelist_status         5413 non-null   object
 5   created_utc              5413 non-null   int64 
dtypes: int64(1), object(5)
memory usage: 296.0+ KB


In [17]:
# Filter the dataframe to view only samples where the selftext is not null
wldnws[wldnws['selftext'].isna() == False]

Unnamed: 0,subreddit,selftext,title,subreddit_name_prefixed,whitelist_status,created_utc
7,worldnews,[deleted],NL uses potentially biased algorithm for visa ...,r/worldnews,all_ads,1682376013
21,worldnews,[deleted],How Chinese censorship is quietly rewriting Co...,r/worldnews,all_ads,1682371953
52,worldnews,[removed],Russian State Outlet RT Reaches Out to Tucker ...,r/worldnews,all_ads,1682363126
78,worldnews,[removed],Brics draws membership requests from 19 nation...,r/worldnews,all_ads,1682356537
86,worldnews,[removed],Tucker Carlson Is Gone From Fox News,r/worldnews,all_ads,1682354338
...,...,...,...,...,...,...
5741,worldnews,[removed],Israeli Forces Draw Condemnation Over 'Barbari...,r/worldnews,all_ads,1680742365
5783,worldnews,[removed],"Taiwan leader, US Speaker McCarthy meet in Cal...",r/worldnews,all_ads,1680727519
5921,worldnews,[deleted],"Credit Suisse ‘could not be saved,’ chairman s...",r/worldnews,all_ads,1680702660
5935,worldnews,[removed],"In Poland, the Catholic church backed abortion...",r/worldnews,all_ads,1680699508


In [18]:
# Many entries are removed or deleted, but value counts can help find unique entries quickly
wldnws[wldnws['selftext'].isna() == False]['selftext'].value_counts()

[removed]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               

In [19]:
# These posts can be investigated to make sure that the titles provide useful information without the selftext
#  Creat a filter like previously, but with additional conditions to exclude cells where the selftext is 'removed' or 'deleted'

wldnws[(wldnws['selftext'].isna() == False) & (wldnws['selftext'] != '[removed]') & (wldnws['selftext'] != '[deleted]')]

Unnamed: 0,subreddit,selftext,title,subreddit_name_prefixed,whitelist_status,created_utc
666,worldnews,Twitter’s new rules limit the number of automa...,Disaster alert accounts are preparing for a wo...,r/worldnews,all_ads,1682169477
776,worldnews,This request was previously blocked by the US....,Israel holding advanced talks with Germany on ...,r/worldnews,all_ads,1682121969
804,worldnews,Air Canada's cargo operations were handling th...,What genius trusted Air Canada with a $20 Mill...,r/worldnews,all_ads,1682110918
828,worldnews,"Kupiansk downtown looks deserted: shops, pharm...",Constant Missile Attacks and Empty Streets: Ho...,r/worldnews,all_ads,1682100806
861,worldnews,"In 2021, a small boat carrying the bodies of m...","We are Renata Brito and Felipe Dana, journalis...",r/worldnews,all_ads,1682093522
1111,worldnews,Five Army soldiers were killed Thursday aftern...,"Five soldiers killed in J&amp;K terror attack,...",r/worldnews,all_ads,1682020248
1403,worldnews,Lula says that people with mental disorders ha...,Lula (Brazilian president) says that people wi...,r/worldnews,all_ads,1681948544
1503,worldnews,PROOF: https://i.redd.it/hzvrwow9qqua1.jpg\n\n...,I'm a tech journalist who's been covering how ...,r/worldnews,all_ads,1681919689


In [20]:
wldnws[(wldnws['selftext'].isna() == False) & (wldnws['selftext'] != '[removed]') & (wldnws['selftext'] != '[deleted]')].index

Int64Index([666, 776, 804, 828, 861, 1111, 1403, 1503], dtype='int64')

In [21]:
# To read this a bit better, let's print of the title and subtext for each, and format accordingly with f-strings to make it easier to read
#  In the first line, the code is simply grabbing the index of each value where the selftext is populated with something readable.
#  The loop then iterates through each index and prints the title and subtect for that index value using it to call from the column series.
for i in wldnws[(wldnws['selftext'].isna() == False) & (wldnws['selftext'] != '[removed]') & (wldnws['selftext'] != '[deleted]')].index:
    print(f"Title:  {wldnws['title'][i]} \nSelftext:  {wldnws['selftext'][i]} \n {'-'*200} \n")

Title:  Disaster alert accounts are preparing for a world after Twitter 
Selftext:  Twitter’s new rules limit the number of automated tweets an account can send out without paying, making it a less reliable place for emergency alerts 
 -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 

Title:  Israel holding advanced talks with Germany on sale of Arrow 3 
Selftext:  This request was previously blocked by the US. Looks like its firmly back on the table. This is a joint US/Israeli system and is extremely capable. One of the very few ABM systems to intercept multiple targets simultaneously in testing. One of them was outside the atmosphere during intercept. 
 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Observations:
* It can be seen that the titles are relatively complete and that the self text is some sort of short synopsis of the story.  These do not need to be included.
* It can also be seen that some characters like ampersands are not being processed well.  These should be converted if possible.

***Thus, for both the Onion and World News dataframes, the 'selftext' columns can be dropped.***

# Drop All Unneeded Columns

In [22]:
# Just keep only the columns that are needed
onion = onion[['subreddit', 'title']]
wldnws = wldnws[['subreddit', 'title']]

# Initial EDA / Cleaning

In [23]:
# Source to reset pandas column width:  https://pandas.pydata.org/docs/reference/api/pandas.reset_option.html
# Source to widen pandas columns:  https://stackoverflow.com/questions/11707586/how-do-i-expand-the-output-display-to-see-more-columns-of-a-pandas-dataframe
pd.reset_option('all')
pd.set_option('display.max_colwidth', 200)
onion.head(30)

: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.



Unnamed: 0,subreddit,title
0,TheOnion,Idiot Tornado Tears Harmlessly Through Empty Field - YouTube
1,TheOnion,New Texas Law Requires Schools To Display Image Of God Hung Like A Horse In Every Classroom
2,TheOnion,New Poll Finds Americans Would Respect Biden More If He Shot Them
3,TheOnion,Could You Pass Racial Discrimination Training At Fox News?
4,TheOnion,Dog And Owner Having Public Fight
5,TheOnion,Survey Finds Nearly 6 In 10 Wealthy Americans Living Fraud To Fraud
6,TheOnion,45-Year-Old Man Not A Fan Of What Teen Girls Doing To Eyebrows
7,TheOnion,Man Buys Wife Gun In Case She Ever Needs To Protect Herself From Him
8,TheOnion,remember when the onion used to be funny/quirky takes on life and not just libtard pandering
9,TheOnion,This is gold. Very underrated series


In [24]:
wldnws.head(30)

Unnamed: 0,subreddit,title
0,worldnews,"The parents of a 10-year-old boy living with autism, ADHD and anxiety are ""sickened"" after their son was mocked in a video created by a teacher, which was shown to staff and posted online"
2,worldnews,Mexico finds tons of liquid meth in tequila bottles at port
3,worldnews,"Tucker Carlson Leaving Fox News, Last Episode Already Aired"
4,worldnews,Film explores B.C. woman’s experience with magic mushrooms to ease cancer anxiety
5,worldnews,UK warns China secrecy over military expansion risks 'tragic miscalculation'
6,worldnews,Netherlands uses potentially biased algorithm for issuing visas
7,worldnews,NL uses potentially biased algorithm for visa applications
8,worldnews,"Drones attack Sevastopol, crashed drone found near Moscow"
9,worldnews,"Want your own private island? One is for sale in Scotland, but there's a catch"
10,worldnews,"Another Sudan ceasefire announced as locals seek refuge, foreigners are evacuated"


### Observations:
* World news titles may be longer
* Generally, the formats are the same, at approximately one sentense long
* In some cases, the new agency name is contained in the title (Like The New York Times)
* The Onion may use more coloquialisms and less formal language
* Both sets of posts contain very little ending punctuation like periods, exclamation point, etc.
* Oddities:
    * In World News - There are two posts which appear to be music videos, that should be removed along with any others
    * In the Onion - There is one post that doesn't appear to be a real onion post but rather someone's opinion.  It starts with a lower-casr letter which perhaps may indicate other posts of that nature

# Investigate and Remove Suspect Posts

#### Checks will be Performed on Both Datasets

### Remove Music Videos

In [25]:
# Source to help with string find:  https://www.programiz.com/python-programming/methods/string/find
onion[onion['title'].str.lower().str.find('music') > -1]

Unnamed: 0,subreddit,title
78,TheOnion,Aging Rock Musician Realizes It Time To Grow Up And Get Real Job As Jazz Musician
515,TheOnion,Musicians Explain Why They Hate Ticketmaster
545,TheOnion,Justin Bieber Sells Rights To Music Catalog For $200 Million
553,TheOnion,Biden Impregnates Popular Musician In Effort To Boost Approval Numbers
2272,TheOnion,Neil Young Demands Spotify Remove His Music Over Joe Rogan Vaccine Misinformation
2384,TheOnion,Shitty Music Has Helped Moron Through Hardest Times In His Pointless Life
2466,TheOnion,Artist Crafting Music Box Hopes It Delights At Least One Child In Post-Apocalypse
2918,TheOnion,Country Music Stars Challenge Al-Qaeda with Patriotic New Song 'Bomb New York'
5556,TheOnion,Damning New Footage Shows Sanders In 1980s Arguing Madonna Could Never Make Transition From Music To Film


In [26]:
# Find the Suspect posts in World News that contain Music
#  Source for find() string method:  https://www.programiz.com/python-programming/methods/string/find
wldnws[wldnws['title'].str.lower().str.find('music') > -1]

Unnamed: 0,subreddit,title
17,worldnews,Marley Jeanius - Krenglish (Intro) Official Music Video
18,worldnews,Marley Jeanius - Where Were You All This Time? Official Music Video
563,worldnews,"""Green Paradise"" Special Earth Day video, featuring David Attenborough and music by Wintergatan"
992,worldnews,Italy's antitrust takes steps against Meta in music rights case
1526,worldnews,Music NFT Artist Plays Jazz Saxophone
3120,worldnews,AI vs. Spotify: How ChatGPT is Changing the Game in Music Streaming
3791,worldnews,The History of Billboard Music A Century of Innovation and Adaptability
5311,worldnews,Four Winds Music Festival's pop-up orchestra brings together world-class musicians
5428,worldnews,Yeah Buddy! Band Releases First Music Video


All Onion articles containing music appear to be legitimate onion articles.  This is not the case for a new of the world news articles above.  They will be removed.

In [27]:
# Create a list of indices to be removed.
#  Since indexes are all unique values, sets can be used and easily concatentated.
#   Furhter, cells can be re-run wihtout risk of duplicating values as there would be for lists.
bad_news = {17, 18, 563}
bad_onions = set()  #  Source to make an empty set:  https://www.w3resource.com/python-exercises/sets/python-sets-exercise-1.php

### Investigate Lower Case Article Titles

In [28]:
# Look for titles where the entire title is lowercase which may indicate that the post is not a real onion article
#  Source for the islower() ... I searched for somethin gto tell me it's upper case by accident, and then realized there needed to be a similarly formatted counterpart for lower case:
    # https://careerkarma.com/blog/python-uppercase/#:~:text=Python%20Isupper(),-Before%20you%20convert&text=To%20check%20if%20a%20string,value%20depending%20on%20the%20outcome.
    # https://www.programiz.com/python-programming/methods/string/find
onion[onion['title'].str.islower() == True]

Unnamed: 0,subreddit,title
8,TheOnion,remember when the onion used to be funny/quirky takes on life and not just libtard pandering
383,TheOnion,"the terrible grammar group, here you can be proud of having bad grammar and protection against grammar nazis."
611,TheOnion,me irl
835,TheOnion,suddenlygay.jpg
930,TheOnion,dream's allergic to salt. my source is that it was revealed to me in my sleep
1066,TheOnion,"""social anxiety"""
1410,TheOnion,lmao
1640,TheOnion,bro wake up new spino just dropped
1817,TheOnion,the bibliography section of the wikipedia article
1827,TheOnion,no f way


In [29]:
# Perform the same filtering for World News
wldnws[wldnws['title'].str.islower() == True]

Unnamed: 0,subreddit,title
88,worldnews,#eating #banana #cuteanimals
606,worldnews,44percent of republicans dont want trump to run for reelection
961,worldnews,read for your skin my blog
1040,worldnews,https://gofund.me/de1f933e
1131,worldnews,5 most searched and best-selling books for free link
1279,worldnews,consult automotive manufacturer
1376,worldnews,tarot
1441,worldnews,3 school kids hospitalized after finding and ingesting suspected methamphetamine they thought was candy.
1960,worldnews,17-year-old girl found alive after 20 hours lost at sea
2111,worldnews,lemon8 🍋


#### Observations:

The Onion:
* Many of these are obviously not articles from the onion
* A few that look like they could be are not, except for the one beginning with "37 year old..." which has been verified to be a legitimate (but poorly copied) Onion article.
* Another article starting with "83%..." is citing a legitimate Onion but the actual article has a slightly different title.  Since this title also contains an indefinite article 'a' which is commonly not in Onion or even regular news article titles, this will be removed

World News:
* Like The Onion, many of these are not legitimate articles.  Investigating one of those which appears legitimate has shown that it was removed.  These will all be removed as they are suspect

These will all be cited for removal

In [30]:
# Create a list of indices to be removed from the data:
#  But keep the article cited above, index number 4901
#   Used the following sourece to figure out the exact syntax:  https://www.w3schools.com/python/gloss_python_join_sets.asp

bad_onions = bad_onions.union(set(onion[onion['title'].str.islower() == True].index))

# Use set subtraction to remove the legitimate title for the bad title set
bad_onions = bad_onions - {4901}

In [31]:
# Repeat the above process but for world news
bad_news = bad_news.union(set(wldnws[wldnws['title'].str.islower() == True].index))

### Investigate Really Short Titles
As seen above in the all lower-case investigation, some titles are simply too short to be actual news articles.  Some may contain capital letters which would not have been picked up in the previous example.

A word count feature will be added along with a title length feature.  Titles with relatively short titles will be investigated.

In [32]:
# Add Title Character Length Columns to both data frames
onion['title_len'] = onion['title'].apply(len)
wldnws['title_len'] = wldnws['title'].apply(len)

onion.head(2)

Unnamed: 0,subreddit,title,title_len
0,TheOnion,Idiot Tornado Tears Harmlessly Through Empty Field - YouTube,60
1,TheOnion,New Texas Law Requires Schools To Display Image Of God Hung Like A Horse In Every Classroom,91


In [33]:
# Add word count columns to both dataframes
onion['word_count'] = onion['title'].apply(lambda x: len(x.strip().split()))
wldnws['word_count'] = wldnws['title'].apply(lambda x: len(x.strip().split()))

onion.head(2)

Unnamed: 0,subreddit,title,title_len,word_count
0,TheOnion,Idiot Tornado Tears Harmlessly Through Empty Field - YouTube,60,9
1,TheOnion,New Texas Law Requires Schools To Display Image Of God Hung Like A Horse In Every Classroom,91,17


In [34]:
# Look at titles where the word count is less than 5 to start
onion[onion['word_count'] <= 5]

Unnamed: 0,subreddit,title,title_len,word_count
17,TheOnion,Stoner Architect Drafts All-Foyer Mansion,41,5
31,TheOnion,U.S. Politicians Who Switched Parties,37,5
46,TheOnion,Jimmy Carter Gets Vasectomy Reversed,36,5
63,TheOnion,Bye!,4,1
80,TheOnion,Political Profile: The Dalai Lama,33,5
...,...,...,...,...
5943,TheOnion,"An oldie, but a goodie",22,5
5953,TheOnion,A Timeline Of U.S.–Iran Relations,33,5
5954,TheOnion,Man With Nice Eyes Blown,24,5
5978,TheOnion,Me own paw ate it,17,5


In [35]:
#  ABOVE:  There are legitimate titles with only 5 words.
# Continue investigating title word lengths until they seem mostly suspicious so they can be removed

onion[onion['word_count'] <= 4]

Unnamed: 0,subreddit,title,title_len,word_count
63,TheOnion,Bye!,4,1
112,TheOnion,Critical Erase Theory,21,3
121,TheOnion,Facebook Entirely Memorialized Accounts,39,4
126,TheOnion,Who has the video,17,4
154,TheOnion,BlackPeopleTwitter mods,23,2
...,...,...,...,...
5738,TheOnion,Democrats Somehow Lose Primaries,32,4
5840,TheOnion,business is no more,19,4
5857,TheOnion,Latest discovery in fossils.,28,4
5880,TheOnion,Someone check his GSP,21,4


In [36]:
wldnws[wldnws['word_count'] <= 4]

Unnamed: 0,subreddit,title,title_len,word_count
42,worldnews,Darc Horce on TikTok,20,4
69,worldnews,Tucker Carlson OUT!,19,3
81,worldnews,Tucker Carlson is out,21,4
88,worldnews,#eating #banana #cuteanimals,28,3
95,worldnews,The laws of nature,18,4
...,...,...,...,...
5957,worldnews,أنواع كلب البحر,15,3
5959,worldnews,لماذا بنيت الأهرامات,20,3
5962,worldnews,قصص حقيقية قصيرة,16,3
5965,worldnews,قصص واقعية للمتزوجين,20,3


#### Observations:
* Spot checking the onion articles above has shown that many of the shorter titles are legitimate onion article posts.  They will be kept
* World news has a few issues:
    * Many of the titles shown above cannot be found when searched and appear to have been removed
    * Some titles are simply hashtags, and should probably be removed for lacking legitimacy
    * Some titles are in other languages with non-english characters which should be removed only because this analysis is focused on the english language for simplicity
    
The Onion dataset will be left alone, however, the World News dataset will need to be investigated.

### Remove World News Posts Which Do Not Contain English Letters

In [37]:
import config

In [38]:
# Source:  https://stackoverflow.com/questions/52455774/googletrans-stopped-working-with-error-nonetype-object-has-no-attribute-group
# https://deep-translator.readthedocs.io/en/latest/README.html
from deep_translator import GoogleTranslator
from deep_translator import single_detection


In [47]:
lang = single_detection('lemon8 🍋', api_key=config.detect_key)
lang

'en'

In [53]:
wldnws

Unnamed: 0,subreddit,title,title_len,word_count
0,worldnews,"The parents of a 10-year-old boy living with autism, ADHD and anxiety are ""sickened"" after their son was mocked in a video created by a teacher, which was shown to staff and posted online",187,34
2,worldnews,Mexico finds tons of liquid meth in tequila bottles at port,59,11
3,worldnews,"Tucker Carlson Leaving Fox News, Last Episode Already Aired",59,9
4,worldnews,Film explores B.C. woman’s experience with magic mushrooms to ease cancer anxiety,81,12
5,worldnews,UK warns China secrecy over military expansion risks 'tragic miscalculation',76,10
...,...,...,...,...
5984,worldnews,Diners in Japan arrested for dipping own chopsticks in communal bowl of ginger,78,13
5985,worldnews,200 Russian Journalists Sign Letter Demanding U.S. Reporter’s Release,69,9
5986,worldnews,Foxconn founder Gou to run for Taiwan presidency again,54,9
5987,worldnews,"A wartime NATO struggles to replace its chief | The job is suddenly politically sensitive and high-profile, driving people to float names like Ursula von der Leyen that are unlikely — for now.",192,33


In [55]:
wldnws['title'][0:1]

0    The parents of a 10-year-old boy living with autism, ADHD and anxiety are "sickened" after their son was mocked in a video created by a teacher, which was shown to staff and posted online
Name: title, dtype: object

In [50]:
single_detection(wldnws['title'][0], api_key=config.detect_key)

'en'

In [65]:
def lang_detector(title):
    try:
        return single_detection(title, api_key=config.detect_key)
    except:
        return 'error'
    

In [66]:
lang_detector('&')

'error'

In [70]:
test_df = wldnws.head().copy()

In [71]:
test_df['testing_column'] = test_df['title'].apply(lang_detector)

In [73]:
test_df

Unnamed: 0,subreddit,title,title_len,word_count,testing_column
0,worldnews,"The parents of a 10-year-old boy living with autism, ADHD and anxiety are ""sickened"" after their son was mocked in a video created by a teacher, which was shown to staff and posted online",187,34,en
2,worldnews,Mexico finds tons of liquid meth in tequila bottles at port,59,11,en
3,worldnews,"Tucker Carlson Leaving Fox News, Last Episode Already Aired",59,9,en
4,worldnews,Film explores B.C. woman’s experience with magic mushrooms to ease cancer anxiety,81,12,en
5,worldnews,UK warns China secrecy over military expansion risks 'tragic miscalculation',76,10,en


In [60]:
%%time
[single_detection(title, api_key=config.detect_key) for title in wldnws['title'][0:100]]

Wall time: 58.5 s


['en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en']

In [41]:
# SOURCE:  https://towardsdatascience.com/4-python-libraries-to-detect-english-and-non-english-language-c82ad3efd430
# Source:  https://py-googletrans.readthedocs.io/en/latest/

#from googletrans import Translator

In [None]:
wldnws['title'][5957]

In [None]:
[title for title in wldnws['title'] if len(re.findall('[ء-ي]+', title)) != 0]


#len(re.findall('\w+', '1&&'))

In [None]:
'Mexico finds tons of liquid meth in tequila bottles at port'.encode('utf-8')

In [None]:
'\w+', 'أنواع كلب البحر'.encode('utf-8').decode('utf-8')

In [None]:
len(re.findall('\w+', 'أنواع كلب البحر'))

In [None]:
wldnws[wldnws['title'].str.isalpha() == False]

In [None]:
wldnws[wldnws['title'].str.find('#') > -1]

---
# JUNK!

In [None]:
# Source:  https://www.geeksforgeeks.org/python-ways-to-concatenate-two-lists/

test_list3 = [1, 4, 5, 6, 5]
test_list4 = [3, 5, 7, 2, 5]
 
# using list.extend() to concat
test_list3.extend(test_list4)

In [None]:

# set of letters
GEEK = {'g', 'e', 'k'}
 
# adding 's'
GEEK.add('s')

In [None]:
GEEK

In [None]:
keel = {4, 3, 5}

In [None]:
GEEK  keel

In [None]:
test_list3

In [None]:
l = ['Alice', 'Bob', 'Charlie', 'Bob', 'Dave']

k = l.remove('Alice')

l

In [None]:
'Blah blah fun blah blah'[0].islower()

In [None]:
# Source:  https://towardsdatascience.com/4-python-libraries-to-detect-english-and-non-english-language-c82ad3efd430
# Source:  https://textblob.readthedocs.io/en/dev/

from textblob import TextBlob

In [None]:
b = TextBlob("hate - love relationship")
b.subjectivity

In [39]:
GoogleTranslator(source='auto', target='en').translate("أنواع كلب البحر") 

'Types of sea dog'

In [None]:
# junky junk