# Clean articles

## Initialize Cleaning

### Import Packages and Raw Data

In [1]:
import ast
import datetime as dt
from functools import wraps

import numpy as np
import pandas as pd

# Base names (without extension) of the raw input file and the cleaned output file.
raw_file_name = "G_2022_climate_raw"
cleaned_file_name = "G_2022_climate_clean"

# Read the raw CSV and print a column/dtype summary.
raw_csv_path = f"../../data/raw/{raw_file_name}.csv"
init_df = pd.read_csv(raw_csv_path)
# Alternative loader for a zipped copy of the raw file:
# init_df = pd.read_csv(f"../../data/raw/{raw_file_name}.csv.zip", compression="zip")
init_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32726 entries, 0 to 32725
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  32726 non-null  object
 1   type                32726 non-null  object
 2   sectionId           32726 non-null  object
 3   sectionName         32726 non-null  object
 4   webPublicationDate  32726 non-null  object
 5   webTitle            32726 non-null  object
 6   webUrl              32726 non-null  object
 7   apiUrl              32726 non-null  object
 8   fields              32726 non-null  object
 9   tags                32726 non-null  object
 10  references          32726 non-null  object
 11  blocks              32726 non-null  object
 12  rights              32726 non-null  object
 13  section             32708 non-null  object
 14  isHosted            32726 non-null  bool  
 15  pillarId            32485 non-null  object
 16  pillarName          32

### Logging

In [2]:
def log_step(func):
    """Decorator for pipeline steps: print the step name, result shape and run time.

    The wrapped callable is invoked with its arguments unchanged and its result
    is returned as-is. The result must expose a ``shape`` attribute, because it
    is included in the printed line.
    """

    @wraps(func)
    def timed_step(*args, **kwargs):
        started_at = dt.datetime.now()
        result = func(*args, **kwargs)
        finished_at = dt.datetime.now()
        # The elapsed time is rendered through str() of a timedelta.
        time_taken = str(finished_at - started_at)
        print(f"{func.__name__}:\n shape={result.shape} took {time_taken}s\n")
        return result

    return timed_step

## Pipeline - Functions

### Start Pipeline

In [3]:
@log_step
def init_pipeline(df):
    """Return a copy of ``df`` so that later steps work on the copy, not the input."""
    working_copy = df.copy(deep=True)
    return working_copy

### Unfold Columns

In [None]:
@log_step
def unfold_columns(df):
    """Expand the stringified ``fields``, ``rights`` and ``tags`` columns.

    ``fields`` and ``rights`` hold dict literals; every key becomes its own
    column (via ``pd.json_normalize``). ``tags`` holds a list of dict literals;
    the ``webTitle`` and ``id`` of each tag are collected into the list columns
    ``tagWebTitle`` and ``tagId``. The three source columns are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``fields``, ``rights`` and ``tags``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same rows and index as ``df``, the original
        columns minus the three source columns, followed by the expanded
        ``fields`` keys, the expanded ``rights`` keys, ``tagWebTitle`` and
        ``tagId``.

    Raises
    ------
    ValueError, SyntaxError
        If a cell is a string that is not a valid Python literal.
    """

    def _parse_literal(value):
        # literal_eval only accepts literals, so text read from the file can
        # never execute code (eval could). Values that are already parsed
        # (non-strings) are passed through so the step can be re-applied.
        return ast.literal_eval(value) if isinstance(value, str) else value

    dict_cols = ["fields", "rights"]

    expanded_parts = []
    for col in dict_cols:
        records = df[col].map(_parse_literal).tolist()
        expanded = pd.json_normalize(records)
        # json_normalize numbers its rows 0..n-1, while concat(axis=1) aligns
        # on index labels. Re-use the caller's index so rows stay aligned even
        # when the index has gaps (e.g. after duplicates were dropped).
        expanded.index = df.index
        expanded_parts.append(expanded)

    df = pd.concat([df, *expanded_parts], axis=1)

    # Tags extraction
    tags = df["tags"].map(_parse_literal)
    df["tagWebTitle"] = tags.map(lambda tag_list: [tag["webTitle"] for tag in tag_list])
    df["tagId"] = tags.map(lambda tag_list: [tag["id"] for tag in tag_list])

    return df.drop(columns=["tags", *dict_cols])

### Remove duplicates

In [6]:
@log_step
def remove_duplicates(df):
    """Drop rows that exactly repeat an earlier row, keeping the first occurrence.

    The surviving rows keep their original index labels.
    """
    deduplicated = df.drop_duplicates(subset=None, keep="first")
    return deduplicated

### Missing Values

In [7]:
@log_step
def missing_values(df):
    """Placeholder pipeline step for missing-value handling.

    No handling is implemented; the frame is handed back unchanged.
    """
    return df

### Adjust data types

In [8]:
@log_step
def adjust_data_types(df):
    """Cast the article columns to their analysis dtypes.

    * ``id``, ``sectionName``, ``type``, ``webTitle``, ``pillarName``,
      ``headline``, ``bodyText`` -> pandas ``string``
    * ``charCount`` -> ``int``
    * ``productionOffice`` -> ``category``
    * ``isLive``, ``syndicatable`` -> ``"true"``/``"false"`` mapped to
      ``True``/``False`` (any other value becomes NaN)
    * ``commentable`` -> ``True`` only for ``"true"``, otherwise ``False``
    * ``webPublicationDate``, ``firstPublicationDate`` -> datetime

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all of the columns listed above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the converted columns.

    Raises
    ------
    KeyError
        If one of the listed columns is missing.
    """
    # astype with a mapping returns a new frame, so the caller's frame is
    # never mutated by the assignments below.
    df = df.astype(
        {
            # Strings
            "id": "string",
            "sectionName": "string",
            "type": "string",
            "webTitle": "string",
            "pillarName": "string",
            "headline": "string",
            "bodyText": "string",
            # Numerical
            "charCount": "int",
            # Categorical
            "productionOffice": "category",
        }
    )

    # Boolean
    true_false = {"true": True, "false": False}
    df["isLive"] = df["isLive"].map(true_false)
    df["syndicatable"] = df["syndicatable"].map(true_false)
    # A missing flag counts as "not commentable". Comparing against "true"
    # also maps an explicit "false" to False; the former dict lookup had no
    # "false" key and turned it into NaN.
    df["commentable"] = df["commentable"].eq("true")

    # Datetimes
    df["webPublicationDate"] = pd.to_datetime(df["webPublicationDate"])
    df["firstPublicationDate"] = pd.to_datetime(df["firstPublicationDate"])

    return df

### Drop Columns

In [9]:
@log_step
def drop_columns(df):
    """Return ``df`` without the ``webUrl``, ``apiUrl``, ``isHosted`` and ``pillarId`` columns."""
    unwanted = ("webUrl", "apiUrl", "isHosted", "pillarId")
    return df.drop(columns=list(unwanted))

# @log_step
# def drop_columns(df):
#     cols =[
#         "trailText",
#         "main",
#         "body",
#         "webUrl",
#         "apiUrl",
#         "standfirst",
#         "section",
#         #"tags",
#         "newspaperPageNumber",
#         "newspaperEditionDate",
#         "byline",
#         "pillarId",
#         "bylineHtml",
#         "displayHint",
#         "commentCloseDate",
#         "isInappropriateForSponsorship",
#         "isPremoderated",
#         "liveBloggingNow",
#         "publication",
#         "shouldHideAdverts",
#         "shortUrl",
#         "showInRelatedContent",
#         "thumbnail",
#         "shouldHideReaderRevenue",
#         "lastModified",
#         "legallySensitive",
#         "lang",
#         "showAffiliateLinks",
#         "wordcount",
#         "sensitive",
#         "sectionId",
#         "subscriptionDatabases",
#         "developerCommunity",
#         "contributorBio",
#         "scheduledPublicationDate"
#         ]
#     return df.drop(columns=cols)

### CSV

In [11]:
@log_step
def create_csv(df):
    """Write ``df`` to the cleaned CSV file and return it so the chain can continue.

    The file name comes from the module-level ``cleaned_file_name``; the index
    is not written, the header row is.
    """
    # NOTE(review): the raw file is read from "../../data/raw/" while this
    # writes under "../data/" - confirm the target directory is intended.
    target_path = f"../data/{cleaned_file_name}.csv"
    df.to_csv(target_path, header=True, index=False)
    return df

# @log_step
# def create_csv(df):
#     df.to_csv(f"../data/{file_name}_clean.csv.zip", index=False, compression="zip")
#     df.info()
#     return df


## Run Cleaning

In [12]:
# Run the cleaning steps in order; every step is wrapped by log_step and
# therefore prints its name, output shape and duration.
articles = (
    init_df
    .pipe(init_pipeline)      # work on a copy of the raw frame
    .pipe(remove_duplicates)  # drop exact duplicate rows
    .pipe(unfold_columns)     # expand the fields / rights / tags columns
    .pipe(drop_columns)       # remove webUrl, apiUrl, isHosted, pillarId
    .pipe(adjust_data_types)  # cast columns to their final dtypes
    .pipe(missing_values)     # currently a pass-through
    .pipe(create_csv)         # write the cleaned CSV
)


start_pipeline:
 shape=(32726, 17) took 0:00:00.005984s

remove_duplicates:
 shape=(32713, 17) took 0:00:05.866353s

add_columns:
 shape=(32713, 17) took 0:00:00s

drop_columns:
 shape=(32713, 13) took 0:00:00.005984s

adjust_data_types:
 shape=(32713, 13) took 0:00:00.048975s

create_csv:
 shape=(32713, 13) took 0:00:35.377413s



In [None]:
# Count the missing values per column of the cleaned frame.
articles.isna().sum()

In [13]:
# Print the column names, non-null counts and dtypes of the cleaned frame.
articles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32713 entries, 0 to 32725
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   article_id          32713 non-null  string             
 1   article_type        32713 non-null  string             
 2   sectionId           32713 non-null  object             
 3   sectionName         32713 non-null  object             
 4   webPublicationDate  32713 non-null  datetime64[ns, UTC]
 5   webTitle            32713 non-null  object             
 6   fields              32713 non-null  object             
 7   tags                32713 non-null  object             
 8   references          32713 non-null  object             
 9   blocks              32713 non-null  object             
 10  rights              32713 non-null  object             
 11  section             32695 non-null  object             
 12  pillarName          32472 non-nu

In [14]:
# Show five random rows; the fixed seed makes the displayed rows
# reproducible across "Restart & Run All".
articles.sample(5, random_state=42)

Unnamed: 0,article_id,article_type,sectionId,sectionName,webPublicationDate,webTitle,fields,tags,references,blocks,rights,section,pillarName
2212,sport/2022/may/21/emma-raducanu-ready-for-deja...,article,sport,Sport,2022-05-21 21:30:30+00:00,Emma Raducanu ready for deja vu against qualif...,{'headline': 'Emma Raducanu ready for deja vu ...,"[{'id': 'sport/emma-raducanu', 'type': 'keywor...",[{'id': 'rich-link/https://www.theguardian.com...,"{'main': {'id': '6288f7df8f08a8124b1507ef', 'b...","{'syndicatable': 'true', 'subscriptionDatabase...","{'id': 'sport', 'webTitle': 'Sport', 'webUrl':...",Sport
30292,science/2022/jan/13/letter-richard-leakey-obit...,article,science,Science,2022-01-13 15:26:17+00:00,Letter: Richard Leakey obituary,{'headline': 'Letter: Richard Leakey obituary'...,"[{'id': 'science/evolution', 'type': 'keyword'...",[],"{'main': {'id': '61e022378f085adc21a85164', 'b...","{'syndicatable': 'true', 'subscriptionDatabase...","{'id': 'science', 'webTitle': 'Science', 'webU...",News
17198,money/2022/mar/14/uk-children-raise-ukraine-fu...,article,money,Money,2022-03-14 11:29:06+00:00,UK children raise funds for Ukraine with hairc...,{'headline': 'UK children raise funds for Ukra...,"[{'id': 'money/charitable-giving', 'type': 'ke...",[{'id': 'rich-link/https://www.theguardian.com...,"{'main': {'id': '622f22458f08527b368564c9', 'b...","{'syndicatable': 'true', 'subscriptionDatabase...","{'id': 'money', 'webTitle': 'Money', 'webUrl':...",Lifestyle
5052,us-news/2022/may/09/mississippi-governor-contr...,article,us-news,US news,2022-05-09 15:36:55+00:00,Mississippi governor refuses to rule out banni...,{'headline': 'Mississippi governor refuses to ...,"[{'id': 'us-news/mississippi', 'type': 'keywor...",[],"{'main': {'id': '627933098f084fe9084bc699', 'b...","{'syndicatable': 'true', 'subscriptionDatabase...","{'id': 'us-news', 'webTitle': 'US news', 'webU...",News
30350,sport/2022/jan/13/jim-harbaugh-nfl-comeback-ru...,article,sport,Sport,2022-01-13 10:30:29+00:00,Jim Harbaugh’s likely NFL return shows college...,{'headline': 'Jim Harbaugh’s likely NFL return...,"[{'id': 'sport/nfl', 'type': 'keyword', 'secti...",[],"{'main': {'id': '61dfc16e8f08b14fee0b3582', 'b...","{'syndicatable': 'true', 'subscriptionDatabase...","{'id': 'sport', 'webTitle': 'Sport', 'webUrl':...",Sport


In [15]:
# Summarise the publication timestamps to check the covered date range.
# NOTE(review): the stored output lists top/freq/first/last; the summary layout
# for datetime columns may differ between pandas versions - confirm on re-run.
articles.webPublicationDate.describe()

  articles.webPublicationDate.describe()


count                         32713
unique                        25665
top       2022-03-06 06:00:45+00:00
freq                              9
first     2022-01-01 00:00:33+00:00
last      2022-05-31 23:44:57+00:00
Name: webPublicationDate, dtype: object

In [None]:
# test = pd.read_csv(f"../data/{file_name}.csv.zip", compression="zip")

In [None]:
# test["tags"] = test["tags"].apply(lambda x: eval(x))
# test['tagWebTitle'] = test['tags'].map(lambda x:[i['webTitle'] for i in x])
# test['tagId'] = test['tags'].map(lambda x:[i['id'] for i in x])
# test = test.drop(columns="tags")
# test.info()