# AI Perceptions Project

1. Cleaning the data set

In [1]:
import pandas as pd

#Reading the Corpus
articles=pd.read_csv('../data/g_df', sep="\t")

In [2]:
# check the type of the articles in the corpus
articles['type'].value_counts()

article        11522
liveblog         527
video             27
gallery           27
audio             12
picture           12
interactive       11
Name: type, dtype: int64

In [3]:
#subset to articles of type 'article'
article=articles[articles['type']=='article']

In [4]:
# filter for NA or NAN values in the column body_text and wordcount of 0
article=article[article['wordcount']!=0]
article=article[article['body_text'].notna()]

In [5]:
# double check whether the API has queried the correct articles
# filter the body_text column for the word 'machine learning' or 'AI' and put into a new dataframe
query_check_df = article[article['body_text'].str.contains('machine learning|machine-learning|AI|artificial intelligence|artificial-intelligence|deep-learning|deep learning|intelligent machines|BERT|GPT-3|deep mind|DeepMind')]

# check the length of the dataframe
query_check_df.shape

(6814, 47)

In [6]:
#Cleaning html tags from the news articles' body
from bs4 import BeautifulSoup
#Function cleaning html tags
def clean_body(html_text):
    cleantext = BeautifulSoup(html_text, "lxml").text
    return cleantext

#Application to the dataframe
query_check_df["text"]=query_check_df["body"].astype("str").apply(lambda x: clean_body(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  query_check_df["text"]=query_check_df["body"].astype("str").apply(lambda x: clean_body(x))


In [7]:
#Restarting the index to get the appropriate number of articles
query_check_df=query_check_df.reset_index(drop=True)
query_check_df.shape

(6814, 48)

In [8]:
#Removing URLS and Emails from the dataset
#Removing URLS starting with http or www.
query_check_df['text'] = query_check_df['text'].str.replace('http\S+|www.\S+', '', case=False)
#Removing Emails
query_check_df['text'] = query_check_df['text'].replace('[\w\.-]+@[\w\.-]+', '', regex=True)
#Removing URLS without http or www.
query_check_df['text'] = query_check_df['text'].replace('[\w\.-]+\.[\w\.-]+', '', regex=True)

  query_check_df['text'] = query_check_df['text'].str.replace('http\S+|www.\S+', '', case=False)


In [9]:
#Removing "THE GUARDIAN" from news' text
query_check_df['text'] = query_check_df['text'].replace("THE GUARDIAN", '', regex=True)

In [11]:
#import nltk

In [12]:
#nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/dinah/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
from nltk import tokenize
#Creating a copy of the dataframe with a column including the sentences per text
q_df=query_check_df
q_df["text_sentences"]=query_check_df.text.apply(lambda x: tokenize.sent_tokenize(x))
clean_articles=q_df.assign(sentence=q_df.text_sentences).explode('sentence')
#clean_articles=query_check_df.assign(sentence=query_check_df.text.str.split(".")).explode('sentence')

# Using the previous index to identify the article number and reseting the index
clean_articles.index.name="article_nr"
clean_articles=clean_articles.reset_index()


In [14]:
clean_articles.tail(10)

Unnamed: 0,article_nr,id,type,section_id,section_name,web_publication_date,web_title,web_url,api_url,is_hosted,...,display_hint,star_rating,sensitive,live_blogging_now,contributor_bio,allow_ugc,scheduled_publication_date,text,text_sentences,sentence
316247,6813,world/2000/jan/01/millennium.uk1,article,us-news,US news,2000-01-01T01:54:17Z,Born to be wired,https://www.theguardian.com/world/2000/jan/01/...,https://content.guardianapis.com/world/2000/ja...,False,...,,,,,,,,"The baby born today, living an average life, a...","[The baby born today, living an average life, ...","Then again, events will probably jolt the self..."
316248,6813,world/2000/jan/01/millennium.uk1,article,us-news,US news,2000-01-01T01:54:17Z,Born to be wired,https://www.theguardian.com/world/2000/jan/01/...,https://content.guardianapis.com/world/2000/ja...,False,...,,,,,,,,"The baby born today, living an average life, a...","[The baby born today, living an average life, ...","Around 2040, says Yorick Blumenfeld, a rare pe..."
316249,6813,world/2000/jan/01/millennium.uk1,article,us-news,US news,2000-01-01T01:54:17Z,Born to be wired,https://www.theguardian.com/world/2000/jan/01/...,https://content.guardianapis.com/world/2000/ja...,False,...,,,,,,,,"The baby born today, living an average life, a...","[The baby born today, living an average life, ...",So few people will have so much economic power...
316250,6813,world/2000/jan/01/millennium.uk1,article,us-news,US news,2000-01-01T01:54:17Z,Born to be wired,https://www.theguardian.com/world/2000/jan/01/...,https://content.guardianapis.com/world/2000/ja...,False,...,,,,,,,,"The baby born today, living an average life, a...","[The baby born today, living an average life, ...","Or maybe these islands will be invaded, by a p..."
316251,6813,world/2000/jan/01/millennium.uk1,article,us-news,US news,2000-01-01T01:54:17Z,Born to be wired,https://www.theguardian.com/world/2000/jan/01/...,https://content.guardianapis.com/world/2000/ja...,False,...,,,,,,,,"The baby born today, living an average life, a...","[The baby born today, living an average life, ...","Or, as British Telecom's Atlas of the Future c..."
316252,6813,world/2000/jan/01/millennium.uk1,article,us-news,US news,2000-01-01T01:54:17Z,Born to be wired,https://www.theguardian.com/world/2000/jan/01/...,https://content.guardianapis.com/world/2000/ja...,False,...,,,,,,,,"The baby born today, living an average life, a...","[The baby born today, living an average life, ...",The millennium baby had better learn to use a ...
316253,6813,world/2000/jan/01/millennium.uk1,article,us-news,US news,2000-01-01T01:54:17Z,Born to be wired,https://www.theguardian.com/world/2000/jan/01/...,https://content.guardianapis.com/world/2000/ja...,False,...,,,,,,,,"The baby born today, living an average life, a...","[The baby born today, living an average life, ...",21st-century life at a glance 2010: Entire hum...
316254,6813,world/2000/jan/01/millennium.uk1,article,us-news,US news,2000-01-01T01:54:17Z,Born to be wired,https://www.theguardian.com/world/2000/jan/01/...,https://content.guardianapis.com/world/2000/ja...,False,...,,,,,,,,"The baby born today, living an average life, a...","[The baby born today, living an average life, ...",2020: Average lifespan in developed world reac...
316255,6813,world/2000/jan/01/millennium.uk1,article,us-news,US news,2000-01-01T01:54:17Z,Born to be wired,https://www.theguardian.com/world/2000/jan/01/...,https://content.guardianapis.com/world/2000/ja...,False,...,,,,,,,,"The baby born today, living an average life, a...","[The baby born today, living an average life, ...",2040: Satellite colonies successfully establis...
316256,6813,world/2000/jan/01/millennium.uk1,article,us-news,US news,2000-01-01T01:54:17Z,Born to be wired,https://www.theguardian.com/world/2000/jan/01/...,https://content.guardianapis.com/world/2000/ja...,False,...,,,,,,,,"The baby born today, living an average life, a...","[The baby born today, living an average life, ...",2100: No clear distinction between humans and ...


In [15]:
# check the length of the dataframe
clean_articles.shape

(316257, 51)

In [16]:
# remove all articles that are not in English
full_df=clean_articles[clean_articles['lang']=='en']


In [17]:
#subset the dataframe to the columns article_nr, sentence, wordcount, section_name publication_date, web_publication_date, headline, web_title, production_office, publication
final_df=clean_articles[['article_nr','sentence','wordcount','section_name','web_publication_date','headline','web_title','production_office','publication']]

In [20]:
# save dataframe to file
#full_df.to_csv('full_df.tsv', sep="\t")
final_df.to_csv('final_df.tsv', sep="\t")

In [23]:
query_check_df.shape

(6813, 49)

In [22]:
# remove all articles that are not in English from query_check_df
query_check_df=query_check_df[query_check_df['lang']=='en']

In [25]:
d_df = query_check_df[['body','body_text','wordcount','section_name','web_publication_date','headline','web_title','production_office','publication']]

In [26]:
# give all column names of the dataframe
d_df.to_csv('d_df.tsv', sep="\t")