<a href="https://colab.research.google.com/github/gylam/siads696-sprsum2024-team05/blob/main/2_data_manipulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from collections import Counter

#### Read in data

In [None]:
# Forward slashes resolve on Windows, macOS and Linux (including Colab); the
# previous '..\data\...' literal only worked on Windows and relied on the
# invalid escape sequences '\d' and '\g', which newer Pythons warn about.
# NOTE: read_pickle can execute arbitrary code while unpickling, so only load
# pickle files that come from a trusted source.
data_path = '../data/gl_full_pickle.pickle'
df_raw = pd.read_pickle(data_path)
print(df_raw.shape)

(459937, 16)


In [None]:
# Expand 'theme_name' so that each (report, theme) pair gets its own row.
# Presumably the column holds a list of themes per report -- the row count
# grows after this step. Exploded rows keep their original index label, so the
# index is no longer unique afterwards.
df_raw = df_raw.explode(column='theme_name')
df_raw.shape

(1135953, 16)

#### Drop short documents and unwanted report formats

In [None]:
# Keep only documents with more than 100 words (rows with exactly 100 words,
# or with a missing word_count, are dropped as well).
has_enough_words = df_raw['word_count'] > 100
df_raw = df_raw.loc[has_enough_words]
df_raw.shape

(1072040, 16)

In [None]:
# Remove rows whose report format is one we do not want to model.
# NOTE(review): in the recorded run the shape is identical before and after
# this cell, i.e. no rows matched -- confirm the values stored in 'format'.
unwanted_formats = ['Infographic', 'Map', 'Interactive']
is_unwanted_format = df_raw['format'].isin(unwanted_formats)
df_raw = df_raw.loc[~is_unwanted_format]
print(df_raw.shape)

(1072040, 16)


#### Filter df by text length (`len_text`)

In [None]:
# Keep only reports whose text is longer than 100 characters.
# .str.len() counts characters, not words; the word-based filter uses the
# separate 'word_count' column.
# .assign builds a new frame instead of writing a column into the result of an
# earlier boolean filter, which avoids pandas' SettingWithCopyWarning.
df_raw = (
    df_raw
    .assign(len_text=lambda frame: frame['text'].str.len())
    .loc[lambda frame: frame['len_text'] > 100]
)
df_raw.shape

(1072040, 17)

#### Pickle raw data

In [None]:
# Left commented out: uncomment to persist the filtered raw frame to disk.
# df_raw.to_pickle('../data/RW_Themes_Data_Raw_Full')

#### Reduce df and select fields needed

In [None]:
# Keep only the report identifier, the report text and its theme label.
df = df_raw.loc[:, ['report_id', 'text', 'theme_name']]

In [None]:
# Summarise the reduced frame: its dimensions and how many distinct reports
# it covers (a report spans several rows when it carries several themes).
df_shape = df.shape
report_number = len(df['report_id'].unique())
print(f'Shape of the Dataframe (multiple rows per unique report) = {df_shape[0]} rows and {df_shape[1]} columns')
print(f'Number of unique ReliefWeb reports retrieved = {report_number}')


Shape of the Dataframe (multiple rows per unique report) = 1072040 rows and 3 columns
Number of unique ReliefWeb reports retrieved = 433530


#### Reduce data to keep reports with a single label/theme

In [None]:
# TODO: decide whether reports with more than 3 labels should be excluded.
# Current approach: use only reports with exactly 1 label and try multiclass.
# NOTE(review): the name `df_theme_counts` is rebound to a function further
# down in this notebook, so this cell has to be re-run before the frame is
# used again after that definition has executed.

# Number of rows (one per theme after the explode step) for each report;
# .size() also counts rows whose theme_name is missing.
rows_per_report = df.groupby(['report_id'])['theme_name'].size()
df_theme_counts = rows_per_report.reset_index().sort_values(by='theme_name', ascending=False)

# report_ids that occur exactly once, i.e. reports carrying a single theme.
has_one_theme = df_theme_counts['theme_name'] == 1
single_label_reports = df_theme_counts.loc[has_one_theme, 'report_id']
print(f"Number of reports with a single label/theme = {len(single_label_reports.unique())}")

# Restrict the full frame (all columns) to those single-theme reports.
single_label_data = df_raw.loc[df_raw['report_id'].isin(single_label_reports)]

# Sanity check: every remaining report should have a theme count of 1.
single_label_data_theme_counts = (
    single_label_data
    .groupby(['report_id'])['theme_name']
    .size()
    .reset_index()
    .sort_values(by='theme_name', ascending=False)
)
single_label_data_theme_counts['theme_name'].value_counts()


Number of reports with a single label/theme = 181469


theme_name
1    181469
Name: count, dtype: int64

In [None]:
# Each single-label report contributes exactly one row, so the row count here
# is expected to equal the number of single-label reports printed above.
single_label_data.shape

(181469, 17)

#### Pickle Single Label df

In [None]:
# Left commented out: uncomment to persist the single-label frame to disk.
# single_label_data.to_pickle('../data/RW_Themes_Single_Label_Reports_Data')

In [None]:
# Distribution of the number of themes per report: index = themes on a report,
# value = how many reports carry that many themes.
# NOTE: `df_theme_counts` is still the DataFrame here; a later cell rebinds the
# same name to a function, after which this cell no longer works on re-run.
df_theme_counts['theme_name'].value_counts()

theme_name
1     181469
2     111728
3      48749
4      30300
5      20782
6      14991
7      11018
8       7203
9       3877
10      2073
11       840
12       352
13       101
14        44
15         2
16         1
Name: count, dtype: int64

#### Theme Counts


In [None]:
def df_theme_counts(df, theme_cols):
    """Print the distinct themes found in ``df[theme_cols]`` and their frequencies.

    Args:
        df: Container indexed with ``df[theme_cols]``; the result is iterated
            value by value (e.g. a DataFrame with a single theme column).
        theme_cols: Key/column name selecting the theme values. Despite the
            plural name, it is used as one lookup key.

    Returns:
        None. Two summaries are written to stdout: the themes in order of first
        appearance, then ``(theme, count)`` pairs sorted by descending count
        (ties keep first-appearance order).

    Note:
        Earlier in this notebook the same name is bound to a DataFrame; running
        this definition replaces that binding.
    """
    # Counter keeps keys in first-seen order, which fixes the listing order.
    tally = Counter(df[theme_cols])
    distinct_themes = list(tally)
    # most_common() with no argument is a stable descending sort over all items.
    ranked_counts = tally.most_common()

    print(f'There are {len(distinct_themes)} Themes represented in this dataset.\nThey are: \n{distinct_themes}')
    print('\n')
    print(f'Counts of each unique "theme" in this dataset:\n{ranked_counts}')

In [None]:
# Print theme frequencies for the single-label subset, where every report
# contributes exactly one theme.
df_theme_counts(single_label_data, 'theme_name')

There are 20 Themes represented in this dataset.
They are: 
['Agriculture', 'Disaster Management', 'Coordination', 'Peacekeeping and Peacebuilding', 'Protection and Human Rights', 'HIV/Aids', 'Education', 'Food and Nutrition', 'Mine Action', 'Camp Coordination and Camp Management', 'Contributions', 'Shelter and Non-Food Items', 'Recovery and Reconstruction', 'Health', 'Water Sanitation Hygiene', 'Safety and Security', 'Logistics and Telecommunications', 'Humanitarian Financing', 'Climate Change and Environment', 'Gender']


Counts of each unique "theme" in this dataset:
[('Protection and Human Rights', 62596), ('Health', 36493), ('Peacekeeping and Peacebuilding', 11404), ('Recovery and Reconstruction', 9524), ('Food and Nutrition', 8766), ('Shelter and Non-Food Items', 8387), ('Contributions', 7628), ('Agriculture', 6581), ('Water Sanitation Hygiene', 5306), ('Disaster Management', 5189), ('Coordination', 4755), ('Education', 4750), ('Safety and Security', 3326), ('Logistics and Teleco

In [None]:
# Print theme frequencies for the multi-label data, where a report contributes
# one row per theme it carries.

df_theme_counts(df, 'theme_name')


There are 20 Themes represented in this dataset.
They are: 
['Agriculture', 'Disaster Management', 'Peacekeeping and Peacebuilding', 'Protection and Human Rights', 'Safety and Security', 'Coordination', 'Education', 'Health', 'Gender', 'Recovery and Reconstruction', 'HIV/Aids', 'Food and Nutrition', 'Shelter and Non-Food Items', 'Water Sanitation Hygiene', 'Logistics and Telecommunications', 'Climate Change and Environment', 'Mine Action', 'Camp Coordination and Camp Management', 'Contributions', 'Humanitarian Financing']


Counts of each unique "theme" in this dataset:
[('Protection and Human Rights', 173956), ('Health', 163359), ('Food and Nutrition', 139368), ('Water Sanitation Hygiene', 98639), ('Shelter and Non-Food Items', 91426), ('Agriculture', 65766), ('Education', 59395), ('Contributions', 53504), ('Coordination', 49565), ('Recovery and Reconstruction', 48059), ('Peacekeeping and Peacebuilding', 31008), ('Logistics and Telecommunications', 23386), ('Disaster Management', 1819