# Data Cleaning

In [2]:
import pandas as pd

# Input parquet files, listed in the order they are unpacked below
raw_files = (
    "data/original_data_0.parquet",
    "data/original_data_1.parquet",
    "data/original_data_2.parquet",
    "data/original_data_3.parquet",
    "data/original_data_4.parquet",
)

# One dataframe per input file
df0, df1, df2, df3, df4 = (pd.read_parquet(raw_file) for raw_file in raw_files)

In [3]:
# Preview the first five rows of the first dataframe
df0.head(n=5)

Unnamed: 0,ID,publishedAt,instances,source-id,source-name,en-title,language
0,12436,2020-08-06T09:21:27Z,"[{'category': 'general', 'collectedAt': '2020-...",,Albidda.net,A doctor warns of new symptoms of “Corona” tha...,ar
1,12541,2020-08-06T15:45:39Z,"[{'category': 'general', 'collectedAt': '2020-...",,Middle East Online,Foldable phones lead Samsung to climb the top ...,ar
2,12568,2020-08-06T16:43:05Z,"[{'category': 'general', 'collectedAt': '2020-...",,Alanba.com.kw,Explosive stars create calcium in - Kuwait New...,ar
3,12795,2020-08-07T07:08:19Z,"[{'category': 'general', 'collectedAt': '2020-...",,Al-ain.com,Twitter secures the accounts of governments an...,ar
4,13129,2020-08-07T13:33:00Z,"[{'category': 'general', 'collectedAt': '2020-...",,محليات,Corona patients without symptoms carry a viral...,ar


In [4]:
# Storage dtype of the "instances" column
df0.dtypes["instances"]

dtype('O')

In [5]:
# Python type of a single "instances" cell (first row, by position)
type(df0["instances"].iat[0])

numpy.ndarray

In [6]:
# Show the "instances" value stored in the first row of the first dataframe
df0["instances"].iat[0]

array([{'category': 'general', 'collectedAt': '2020-08-08T10:03:00Z', 'location': 'ae'}],
      dtype=object)

### Extract Location from "Instances" Column

In [18]:
def extract_location(row):
    """Return the "location" value of the first entry in an ``instances`` cell.

    Parameters
    ----------
    row : sequence of dict
        One cell of the "instances" column (a NumPy object array of dicts in
        the data previewed above).

    Returns
    -------
    The first entry's "location" value, or ``None`` when the cell is missing,
    empty, not indexable, or its first entry is not a dict / has no
    "location" key.
    """
    # Index defensively: one empty or null cell would otherwise raise and
    # abort the whole Series.apply call.
    try:
        first = row[0]
    except (TypeError, IndexError, KeyError):
        return None
    return first.get("location") if isinstance(first, dict) else None


# Add a "location_code" column to every dataframe (modifies each one in place)
for frame in (df0, df1, df2, df3, df4):
    frame["location_code"] = frame["instances"].apply(extract_location)
df0.head()

Unnamed: 0,ID,publishedAt,instances,source-id,source-name,en-title,language,location_code,location
0,12436,2020-08-06T09:21:27Z,"[{'category': 'general', 'collectedAt': '2020-...",,Albidda.net,A doctor warns of new symptoms of “Corona” tha...,ar,ae,United Arab Emirates
1,12541,2020-08-06T15:45:39Z,"[{'category': 'general', 'collectedAt': '2020-...",,Middle East Online,Foldable phones lead Samsung to climb the top ...,ar,ae,United Arab Emirates
2,12568,2020-08-06T16:43:05Z,"[{'category': 'general', 'collectedAt': '2020-...",,Alanba.com.kw,Explosive stars create calcium in - Kuwait New...,ar,ae,United Arab Emirates
3,12795,2020-08-07T07:08:19Z,"[{'category': 'general', 'collectedAt': '2020-...",,Al-ain.com,Twitter secures the accounts of governments an...,ar,ae,United Arab Emirates
4,13129,2020-08-07T13:33:00Z,"[{'category': 'general', 'collectedAt': '2020-...",,محليات,Corona patients without symptoms carry a viral...,ar,ae,United Arab Emirates


### Map to Country

In [16]:
# Lookup table from the two-letter lowercase code stored in the "location"
# field of each instance to a human-readable country name. It is applied with
# Series.map, so any code missing from this table becomes NaN in the result.
# NOTE(review): the keys look like ISO 3166-1 alpha-2 codes — confirm with the data source.
# NOTE(review): "kr" is labelled just "Korea" — presumably South Korea; confirm.
location_map = {
    "ae": "United Arab Emirates",
    "ar": "Argentina",
    "ph": "Philippines",
    "ng": "Nigeria",
    "in": "India",
    "us": "United States",
    "ca": "Canada",
    "sa": "Saudi Arabia",
    "cu": "Cuba",
    "au": "Australia",
    "br": "Brazil",
    "ma": "Morocco",
    "id": "Indonesia",
    "eg": "Egypt",
    "it": "Italy",
    "gb": "United Kingdom",
    "ie": "Ireland",
    "mx": "Mexico",
    "tr": "Turkey",
    "gr": "Greece",
    "de": "Germany",
    "jp": "Japan",
    "za": "South Africa",
    "fr": "France",
    "pl": "Poland",
    "pt": "Portugal",
    "co": "Colombia",
    "my": "Malaysia",
    "ru": "Russian Federation",
    "at": "Austria",
    "nz": "New Zealand",
    "tw": "Taiwan",
    "nl": "Netherlands",
    "sg": "Singapore",
    "be": "Belgium",
    "cn": "China",
    "ve": "Venezuela",
    "th": "Thailand",
    "se": "Sweden",
    "kr": "Korea",
    "hk": "Hong Kong",
    "rs": "Serbia",
    "hu": "Hungary",
    "cz": "Czechia",
    "ch": "Switzerland",
    "il": "Israel",
    "bg": "Bulgaria",
    "ua": "Ukraine",
    "ro": "Romania",
    "no": "Norway",
    "sk": "Slovakia",
    "lv": "Latvia",
    "lt": "Lithuania",
    "si": "Slovenia"
}

In [20]:
# Translate each location code into its country name, one dataframe at a time
for frame in (df0, df1, df2, df3, df4):
    frame["location"] = frame["location_code"].map(location_map)

df0.head()

Unnamed: 0,ID,publishedAt,instances,source-id,source-name,en-title,language,location_code,location
0,12436,2020-08-06T09:21:27Z,"[{'category': 'general', 'collectedAt': '2020-...",,Albidda.net,A doctor warns of new symptoms of “Corona” tha...,ar,ae,United Arab Emirates
1,12541,2020-08-06T15:45:39Z,"[{'category': 'general', 'collectedAt': '2020-...",,Middle East Online,Foldable phones lead Samsung to climb the top ...,ar,ae,United Arab Emirates
2,12568,2020-08-06T16:43:05Z,"[{'category': 'general', 'collectedAt': '2020-...",,Alanba.com.kw,Explosive stars create calcium in - Kuwait New...,ar,ae,United Arab Emirates
3,12795,2020-08-07T07:08:19Z,"[{'category': 'general', 'collectedAt': '2020-...",,Al-ain.com,Twitter secures the accounts of governments an...,ar,ae,United Arab Emirates
4,13129,2020-08-07T13:33:00Z,"[{'category': 'general', 'collectedAt': '2020-...",,محليات,Corona patients without symptoms carry a viral...,ar,ae,United Arab Emirates


### Extract Article Category

In [7]:
# Extract Article Category
def extract_category(row):
    """Return the "category" value of the first entry in an ``instances`` cell.

    Parameters
    ----------
    row : sequence of dict
        One cell of the "instances" column (a NumPy object array of dicts in
        the data previewed above).

    Returns
    -------
    The first entry's "category" value, or ``None`` when the cell is missing,
    empty, not indexable, or its first entry is not a dict / has no
    "category" key.
    """
    # Index defensively: one empty or null cell would otherwise raise and
    # abort the whole Series.apply call.
    try:
        first = row[0]
    except (TypeError, IndexError, KeyError):
        return None
    return first.get("category") if isinstance(first, dict) else None


# Add a "category" column to every dataframe (modifies each one in place)
for frame in (df0, df1, df2, df3, df4):
    frame["category"] = frame["instances"].apply(extract_category)
df0.head()

Unnamed: 0,ID,publishedAt,instances,source-id,source-name,en-title,language,category
0,12436,2020-08-06T09:21:27Z,"[{'category': 'general', 'collectedAt': '2020-...",,Albidda.net,A doctor warns of new symptoms of “Corona” tha...,ar,general
1,12541,2020-08-06T15:45:39Z,"[{'category': 'general', 'collectedAt': '2020-...",,Middle East Online,Foldable phones lead Samsung to climb the top ...,ar,general
2,12568,2020-08-06T16:43:05Z,"[{'category': 'general', 'collectedAt': '2020-...",,Alanba.com.kw,Explosive stars create calcium in - Kuwait New...,ar,general
3,12795,2020-08-07T07:08:19Z,"[{'category': 'general', 'collectedAt': '2020-...",,Al-ain.com,Twitter secures the accounts of governments an...,ar,general
4,13129,2020-08-07T13:33:00Z,"[{'category': 'general', 'collectedAt': '2020-...",,محليات,Corona patients without symptoms carry a viral...,ar,general


In [8]:
# Gather every distinct category label that appears in any of the dataframes
unique_categories = set().union(
    *(frame["category"].unique() for frame in (df0, df1, df2, df3, df4))
)

unique_categories

{'business',
 'entertainment',
 'general',
 'health',
 'science',
 'sports',
 'technology'}

### Extract Year and Month from "Published At" Column

In [13]:
# Show the raw "publishedAt" value stored in the first row
df0["publishedAt"].iat[0]

'2020-08-06T09:21:27Z'

In [15]:
# Extract Year and Month from "Published At" Column
def extract_year_month(date_str):
    """Parse a timestamp and return its ``(year, month)``.

    Parameters
    ----------
    date_str : str or None
        A timestamp such as ``'2020-08-06T09:21:27Z'``.

    Returns
    -------
    tuple
        ``(year, month)`` as integers, or ``(None, None)`` when the value is
        missing or cannot be parsed as a date.
    """
    # pd.isna already covers None, NaN and NaT, so no separate None check is needed.
    if pd.isna(date_str):
        return None, None
    try:
        date_obj = pd.to_datetime(date_str)
    except (ValueError, TypeError, OverflowError):
        # Catch parse failures only; a bare `except:` would also swallow
        # KeyboardInterrupt and SystemExit.
        return None, None
    # Some inputs (e.g. an empty string) may parse to NaT rather than raise;
    # report those as missing instead of returning (nan, nan).
    if pd.isna(date_obj):
        return None, None
    return date_obj.year, date_obj.month

# Split each parsed (year, month) pair into two new columns on every dataframe
for frame in (df0, df1, df2, df3, df4):
    frame[["year", "month"]] = frame["publishedAt"].apply(
        lambda stamp: pd.Series(extract_year_month(stamp))
    )
df0.head()

Unnamed: 0,ID,publishedAt,instances,source-id,source-name,en-title,language,category,year,month
0,12436,2020-08-06T09:21:27Z,"[{'category': 'general', 'collectedAt': '2020-...",,Albidda.net,A doctor warns of new symptoms of “Corona” tha...,ar,general,2020.0,8.0
1,12541,2020-08-06T15:45:39Z,"[{'category': 'general', 'collectedAt': '2020-...",,Middle East Online,Foldable phones lead Samsung to climb the top ...,ar,general,2020.0,8.0
2,12568,2020-08-06T16:43:05Z,"[{'category': 'general', 'collectedAt': '2020-...",,Alanba.com.kw,Explosive stars create calcium in - Kuwait New...,ar,general,2020.0,8.0
3,12795,2020-08-07T07:08:19Z,"[{'category': 'general', 'collectedAt': '2020-...",,Al-ain.com,Twitter secures the accounts of governments an...,ar,general,2020.0,8.0
4,13129,2020-08-07T13:33:00Z,"[{'category': 'general', 'collectedAt': '2020-...",,محليات,Corona patients without symptoms carry a viral...,ar,general,2020.0,8.0


In [17]:
# Convert "year" and "month" to pandas' nullable Int64 dtype in every dataframe
for frame in (df0, df1, df2, df3, df4):
    for part in ("year", "month"):
        frame[part] = frame[part].astype("Int64")

In [18]:
# Write each cleaned dataframe to its own parquet file, without the index
cleaned_outputs = {
    "data/cleaned_data_0.parquet": df0,
    "data/cleaned_data_1.parquet": df1,
    "data/cleaned_data_2.parquet": df2,
    "data/cleaned_data_3.parquet": df3,
    "data/cleaned_data_4.parquet": df4,
}
for out_path, frame in cleaned_outputs.items():
    frame.to_parquet(out_path, index=False)