<a href="https://colab.research.google.com/github/gylam/siads696-sprsum2024-team05/blob/main/RW_Themes_data_manipulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from collections import Counter

#### Read in data

In [None]:
# Load the raw ReliefWeb frame from a pickle stored next to the notebook folder.
# Forward slashes keep the relative path valid on Linux/macOS/Colab as well as
# Windows, and avoid the invalid '\d' / '\g' escape sequences of a backslash path.
# Alternative input: RW_UN_themes_data_pickle (explode themes first if they are
# not on individual rows).
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
data_path = '../data/gl_full_pickle'
df_raw = pd.read_pickle(data_path)
print(df_raw.shape)


FileNotFoundError: [Errno 2] No such file or directory: '..\\data\\gl_full_pickle'

#### Drop unwanted report formats

In [None]:
# Exclude report formats that are not wanted for this dataset.
unwanted_formats = ['Infographic', 'Map','Interactive']
is_wanted_format = ~df_raw['format'].isin(unwanted_formats)
df_raw = df_raw[is_wanted_format]
print(df_raw.shape)

(1061984, 14)


#### Filter df by text_len

In [None]:
# Keep only reports whose text is longer than 100 characters.
# .assign() returns a new frame, so the new column is not written into the
# filtered slice created when the unwanted formats were dropped (this avoids
# pandas' SettingWithCopyWarning).
# TODO: len_text counts characters; consider counting words instead.
df_raw = df_raw.assign(len_text=df_raw['text'].str.len())
df_raw = df_raw[df_raw['len_text'] > 100]
df_raw.shape

(1060057, 15)

#### Reduce df and select fields needed

In [None]:
# Narrow the frame to the report identifier, the report text and its theme label.
needed_columns = ['report_id', 'text', 'theme_name']
df = df_raw[needed_columns]

In [None]:
# Summarise the frame: total rows/columns and how many distinct report_id values it holds.
df_shape = df.shape
unique_report_ids = df['report_id'].unique()
report_number = len(unique_report_ids)

print(f'Shape of the Dataframe (multiple rows per unique report) = {df_shape[0]} rows and {df_shape[1]} columns')
print(f'Number of unique ReliefWeb reports retrieved = {report_number}')


Shape of the Dataframe (multiple rows per unique report) = 1060057 rows and 3 columns
Number of unique ReliefWeb reports retrieved = 428424


In [None]:
# Number of theme rows (labels) per report, largest first; the displayed
# value_counts() then shows how many reports carry 1, 2, 3, ... labels.
# NOTE: the name df_theme_counts is rebound to a function further down in this
# notebook, so this frame is only available until that cell has been run.
df_theme_counts = (
    df.groupby(['report_id'])['theme_name']
    .size()
    .reset_index()
    .sort_values(by='theme_name', ascending=False)
)
df_theme_counts['theme_name'].value_counts()

# TODO (open question): exclude reports with more than 3 labels? Or use only
# single-label reports and treat the task as multiclass?


theme_name
1     180197
2     110165
3      47669
4      29552
5      20376
6      14891
7      10805
8       7180
9       3996
10      2172
11       910
12       364
13       108
14        37
15         2
Name: count, dtype: int64

In [None]:
# The frame is sorted by label count descending, so the tail shows the reports with the fewest labels.
df_theme_counts.tail(n=5)

Unnamed: 0,report_id,theme_name
130736,523929,1
284843,2820999,1
130729,523915,1
130726,523905,1
0,18248,1


In [None]:
#printing counts

def df_theme_counts(df, theme_cols):
    """Print the distinct values of ``df[theme_cols]`` and how often each occurs.

    Parameters
    ----------
    df : DataFrame (or any mapping-style container) indexed by ``theme_cols``.
    theme_cols : key used to pull the theme values out of ``df``.

    Returns
    -------
    None. Everything is written to stdout: first the number of distinct themes
    and their names in first-seen order, then a blank separator, then the
    (theme, count) pairs sorted by count descending (ties keep first-seen order).
    """
    # Counter consumes the selected values as a plain iterable.
    tally = Counter(df[theme_cols])
    distinct_themes = list(tally)
    # sorted() is stable, so equally frequent themes stay in first-seen order.
    ranked_counts = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)

    print(f'There are {len(distinct_themes)} Themes represented in this dataset.\nThey are: \n{distinct_themes}')
    print('\n')
    print(f'Counts of each unique "theme" in this dataset:\n{ranked_counts}')

# Print the theme inventory of the full (unbalanced) frame.
df_theme_counts(df, theme_cols='theme_name')


There are 20 Themes represented in this dataset.
They are: 
['Agriculture', 'Camp Coordination and Camp Management', 'Coordination', 'Shelter and Non-Food Items', 'Climate Change and Environment', 'Education', 'Food and Nutrition', 'Gender', 'Health', 'HIV/Aids', 'Protection and Human Rights', 'Water Sanitation Hygiene', 'Recovery and Reconstruction', 'Peacekeeping and Peacebuilding', 'Contributions', 'Logistics and Telecommunications', 'Mine Action', 'Disaster Management', 'Safety and Security', 'Humanitarian Financing']


Counts of each unique "theme" in this dataset:
[('Protection and Human Rights', 170312), ('Health', 160698), ('Food and Nutrition', 137934), ('Water Sanitation Hygiene', 96650), ('Shelter and Non-Food Items', 90556), ('Agriculture', 65471), ('Education', 58370), ('Contributions', 53546), ('Coordination', 50489), ('Recovery and Reconstruction', 47706), ('Peacekeeping and Peacebuilding', 30105), ('Logistics and Telecommunications', 23472), ('Disaster Management', 1865

#### Balance the dataset

In [None]:
# Distinct theme labels of df, in order of first appearance.
unique_themes = df['theme_name'].unique()
theme_list = list(unique_themes)

def balance_dataset(df, documents_per_label, random_state=None):
  """Down-sample ``df`` so that no theme has more than ``documents_per_label`` rows.

  Themes are taken from ``df['theme_name']`` itself (in order of first
  appearance), so the function does not depend on any notebook-level variable.
  A theme with more rows than the cap is randomly sampled down to the cap; a
  theme at or below the cap is kept whole, in its original row order. Rows
  whose theme is missing (NaN) are left out, as no theme label matches them.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain a ``theme_name`` column; expected to hold one theme per row.
  documents_per_label : int
      Maximum number of rows to keep for each theme (must be >= 0).
  random_state : int, numpy Generator/RandomState or None, optional
      Passed to ``DataFrame.sample``; supply a value to make the draw
      reproducible. ``None`` (default) samples non-deterministically.

  Returns
  -------
  pandas.DataFrame
      Same columns and dtypes as ``df``; the original index labels are retained.

  Raises
  ------
  ValueError
      If ``documents_per_label`` is negative.
  """
  if documents_per_label < 0:
    raise ValueError('documents_per_label must be non-negative')

  sampled_parts = []
  # sort=False keeps themes in first-seen order; observed=True skips unused
  # categories should theme_name be a categorical column.
  for _, theme_rows in df.groupby('theme_name', sort=False, observed=True):
    if len(theme_rows) > documents_per_label:
      theme_rows = theme_rows.sample(n=documents_per_label, random_state=random_state)
    sampled_parts.append(theme_rows)

  if not sampled_parts:
    # Nothing to sample from: hand back an empty frame with the same schema.
    return df.iloc[0:0].copy()

  # Concatenate once instead of growing a frame inside the loop.
  return pd.concat(sampled_parts)

# Cap every theme at 6000 rows, then show the size of the balanced frame.
df_balanced = balance_dataset(df, documents_per_label=6000)
df_balanced.shape

(114767, 3)

In [None]:
# Print the theme inventory again, this time for the balanced frame.
df_theme_counts(df_balanced, theme_cols='theme_name')

There are 20 Themes represented in this dataset.
They are: 
['Agriculture', 'Camp Coordination and Camp Management', 'Coordination', 'Shelter and Non-Food Items', 'Climate Change and Environment', 'Education', 'Food and Nutrition', 'Gender', 'Health', 'HIV/Aids', 'Protection and Human Rights', 'Water Sanitation Hygiene', 'Recovery and Reconstruction', 'Peacekeeping and Peacebuilding', 'Contributions', 'Logistics and Telecommunications', 'Mine Action', 'Disaster Management', 'Safety and Security', 'Humanitarian Financing']


Counts of each unique "theme" in this dataset:
[('Agriculture', 6000), ('Camp Coordination and Camp Management', 6000), ('Coordination', 6000), ('Shelter and Non-Food Items', 6000), ('Climate Change and Environment', 6000), ('Education', 6000), ('Food and Nutrition', 6000), ('Gender', 6000), ('Health', 6000), ('Protection and Human Rights', 6000), ('Water Sanitation Hygiene', 6000), ('Recovery and Reconstruction', 6000), ('Peacekeeping and Peacebuilding', 6000), ('C

#### One Hot Encoding

In [None]:
# One-hot encode the theme labels. An empty prefix and separator make every
# indicator column carry the bare theme name, so the columns line up with the
# entries of theme_list without renaming or slicing column names afterwards.
theme_dummies = pd.get_dummies(
    df_balanced,
    columns=['theme_name'],
    prefix='',
    prefix_sep='',
    dtype=int,
)
print(theme_dummies.shape)

# Keep a single row per unique report: a theme flag ends up 1 when any of the
# report's rows in df_balanced carried that theme. Selecting theme_list fixes
# the order of the theme columns.
one_hot_encoded_data = (
    theme_dummies
    .groupby(['report_id', 'text'])[theme_list]
    .max()
    .reset_index()
)
print(one_hot_encoded_data.shape)
one_hot_encoded_data.head(2)

(114767, 22)
(97699, 22)


Unnamed: 0,report_id,text,Agriculture,Camp Coordination and Camp Management,Coordination,Shelter and Non-Food Items,Climate Change and Environment,Education,Food and Nutrition,Gender,...,Protection and Human Rights,Water Sanitation Hygiene,Recovery and Reconstruction,Peacekeeping and Peacebuilding,Contributions,Logistics and Telecommunications,Mine Action,Disaster Management,Safety and Security,Humanitarian Financing
0,18248,**Flood Outlook**\n\n- The Brahmaputra-Jamuna ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,21561,"<font size=1 face=""Arial"">Exploring the Tajiki...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [None]:
# Check the balance: total number of reports flagged for each theme.
# The first two columns are report_id and text; everything after is a theme flag.
theme_columns = one_hot_encoded_data.columns[2:]
theme_totals = one_hot_encoded_data[theme_columns].sum()
one_hot_theme_count = (
    theme_totals
    .reset_index()
    .rename(columns={'index':'theme', 0:'counts'})
)
one_hot_theme_count.sort_values(by='counts', ascending=False)

Unnamed: 0,theme,counts
0,Agriculture,6000
1,Camp Coordination and Camp Management,6000
18,Safety and Security,6000
17,Disaster Management,6000
16,Mine Action,6000
15,Logistics and Telecommunications,6000
14,Contributions,6000
13,Peacekeeping and Peacebuilding,6000
12,Recovery and Reconstruction,6000
11,Water Sanitation Hygiene,6000


#### Save balanced and one hot encoded df to csv

In [None]:
# Write the balanced, one-hot encoded frame to CSV (the index is written too, pandas' default).
balanced_csv_path = r'../data/rw_themes_balanced_df.csv'
one_hot_encoded_data.to_csv(balanced_csv_path)