<a href="https://colab.research.google.com/github/esnue/ThesisAllocationSystem/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Preparation**

As part of the workflow between GitHub and Google Colab, please follow these steps: 
1. Upload the [data](https://drive.google.com/drive/folders/1ExS7M2OOkbYS5Z5O9pbPbaCpSa0rhGet?usp=sharing) to a folder in your GDrive. 
2. Mount your GDrive.
3. Set the data folder as your present working directory. 

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pwd
%cd /content/drive/MyDrive/ThesisAllocationSystem

/content
/content/drive/MyDrive/ThesisAllocationSystem


# **Convert PDF to TXT**

Convert all PDF files in the current working directory to TXT files.

In [None]:
!pip install tika

In [None]:
import os
from tika import parser 
import re

def read_pdf(pdf_file):

    text = parser.from_file(pdf_file)['content']
    non_bytes = text.encode().decode()
    no_space = non_bytes.strip()
    final = no_space.strip('\n')
    return final.encode("latin-1","ignore")

def pdf_to_txt(folder_with_pdf, dest_folder):
    pdf_files = []

    for root, dirs, files in os.walk(folder_with_pdf):
        for f in files:
            if '.pdf' in f:
                pdf_files.append(os.path.join(root, f))
    #print(pdf_files)

    for file_ in pdf_files:
        text_file = os.path.splitext(os.path.basename(file_))[0]+'.txt'
        with open(os.path.join(dest_folder,text_file), 'wb') as text_f:
            text_f.write(read_pdf(file_))

    return None

In [None]:
pdf_to_txt('./supervisors', './supervisors-txt') 

In [None]:
# Warning: This will run a couple minutes
pdf_to_txt('./train-papers', './train-papers-txt') 

In [None]:
pdf_to_txt('./test-theses', './test-theses-txt') 

In [None]:
pdf_to_txt('./test-proposals', './test-proposals-txt') 

# **Put TXT files into CSV**

After importing the packages, define the directory of interest and run the function below to create a CSV files that entails all TXT files in the following structure: character values in columns `FileName` and `Content`.

In [5]:
from glob import glob
import pandas as pd

In [None]:
def txt_to_csv(input_dir, output_dir, new_filename): 
  
  files = glob('/content/drive/MyDrive/ThesisAllocationSystem/' + input_dir + '/*.txt')
  data = [[i, open(i, 'rb').read()] for i in files]
  df = pd.DataFrame(data, columns = ['FileName', 'Content'])
  df['FileName'] = df['FileName'].str.replace('/content/drive/MyDrive/ThesisAllocationSystem/' + input_dir + '/', '')
  df['Content'] = df['Content'].str.slice(start = 0, stop = 32767) # Upper limit of strings per cell in csv
  df.to_csv(output_dir + '/' + new_filename + '.csv', index = False)
  if not df.empty: 
    print('Succesfully converted txt files in directory ' + os.path.basename('/content/drive/MyDrive/ThesisAllocationSystem/' + input_dir + ' to single csv file.'))
  else: 
    print('File empty.') 
  return None

In [None]:
# Warning: This will take a couple minutes
txt_to_csv('train-papers-txt', 'data_final', 'train-papers-final')

Succesfully converted txt files in directory train-papers-txt to single csv file.


In [None]:
txt_to_csv('test-theses-txt', 'data_final', 'test-theses-final')

Succesfully converted txt files in directory test-theses-txt to single csv file.


In [None]:
txt_to_csv('test-proposals-txt', 'data_final', 'test-proposals-final')

Succesfully converted txt files in directory test-proposals-txt to single csv file.


In [None]:
txt_to_csv('supervisors-txt', 'data_final', 'supervisors-final')

Succesfully converted txt files in directory supervisors-txt to single csv file.


# **Data Labelling: Train**

We manually define a dictionary containing a categorical label for each professor, broadly describing their area of research. Thereafter, we integrate these labels into the existing train dataset.

In [6]:
import numpy as np

# creating prof/research categorical label
domain_dict = {'anheier': 'non_profit',
              'bryson': 'technology_governance',
              'cis': 'international_security',
              'cali': 'international_law',
              'cingolani': 'development_studies',              
              'costello': 'migration_law',
              'clachsland': 'climate_sustainability',
              'graf': 'education',
              'hallerberg': 'fiscal_governance',
              'hammerschmid': 'public_management',
              'hassel': 'labour_policy',
              'hirth': 'energy_economics',
              'hustedt': 'public_administration',
              'iacovone': 'development_economics',
              'jachtenfuchs': 'european_governance',
              'jankin': 'data_science',
              'kayser': 'comparative_politics',
              'kreyenfeld': 'social_policy',
              'mair': 'strategic_management',
              'mena': 'organisational_management',              
              'mungiu-pippidi': 'democracy_studies',
              'munzert': 'political_behaviour',
              'patz': 'international_organizations',
              'reh': 'european_politics',
              'roemmele': 'political_communication',
               'shaikh': 'health_economics',
               'snower': 'macroeconomics',
               'stockmann': 'digital_governance',
               'traxler': 'taxation',
               'wegrich': 'policy_process'

}

In [7]:
# Load train data
data = pd.read_csv('/content/drive/MyDrive/ThesisAllocationSystem/data_final/train-papers-final.csv', encoding = 'latin1')

# Remove FileNames from txt ending
data["FileName"] = data["FileName"].str.replace('.txt$', '').str.replace('\d+', '').str.lower().str.replace('\W+', '')

print(data.sample(10))

          FileName                                            Content
108         bryson  b'Extended Ramp Goal Module: Low-Cost Behaviou...
611           mair  b'Entrepreneurship as a Platform for Pursuing ...
205     hallerberg  b'Diego A. Salazar-Morales and Mark Hallerberg...
559  mungiupippidi  b'The Quest for Good Governance: Learning from...
679          hirth  b"Technology-neutral auc-\n\ntions for renewab...
158       costello  b'The European Union as an Area of Freedom, Se...
44          jankin  b'Kenneth Benoit, Drew Conway, Benjamin E. Lau...
208     hallerberg  b'The Role of  Fiscal Coordination and Partisa...
123           cali  b'Hard Protection through Soft Courts? Non-Ref...
99          bryson  b'The Making of the EPSRC Principles of Roboti...


In [8]:
# Create a domain column to facilitate mapping on dictionary keys and pass labels as value
data["domain"] = data["FileName"].map(domain_dict)
print(data)

         FileName  ...             domain
0    hammerschmid  ...  public_management
1    hammerschmid  ...  public_management
2    hammerschmid  ...  public_management
3    hammerschmid  ...  public_management
4    hammerschmid  ...  public_management
..            ...  ...                ...
806       wegrich  ...     policy_process
807       wegrich  ...     policy_process
808       wegrich  ...     policy_process
809       wegrich  ...     policy_process
810       wegrich  ...     policy_process

[811 rows x 3 columns]


In [9]:
# Create binary dummy one-hot encoder for each research domain label
dum_df = pd.get_dummies(data, columns=["domain"])
type(dum_df['domain_comparative_politics'].iloc[1])

numpy.uint8

In [10]:
# concate the two dataframes 
data = pd.concat([data.iloc[:,:2], dum_df.iloc[:,2:]], axis = 1)

# Extract label
dat_label = data.drop_duplicates('FileName')

In [11]:
data.drop(['FileName'], inplace=True, axis=1)

In [14]:
train_df = pd.DataFrame(data)
train_df['content'] = data['Content']
train_df['labels'] = data.iloc[:, 1:].values.tolist()

print(train_df.sample(10))

# type(train_df['labels'].iloc[1])
# label = train_df['labels'].iloc[1]
# type(label[1])

                                               Content  ...                                             labels
573  b'Europe Moves Eastward: Beyond the New Border...  ...  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
494  b'Microsoft Word - Iacovone.sg1 (003).docx\n\n...  ...  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
397  b"Juncker's Curse? Identity, Interest, and Pub...  ...  [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
720  b'Microsoft Word - Working paper_Domestic viol...  ...  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
764  b'Development of legal expertise\n\nAndreas Gl...  ...  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
583  b'#2013-052 \n\xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \x...  ...  [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
395  b'654\n\n43. Institutional work and (ir)respon...  ...  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
670  b'1 Mercator Research Institute on Global Comm...  ...  [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
6

In [15]:
# Save labeled dataframe as csv 
train_df.to_csv('/content/drive/MyDrive/ThesisAllocationSystem/data_final/train-papers-label.csv', index = False)

# **Extract labelled data**

In [None]:
dat_label.drop(['Content'], inplace = True, axis = 1)

In [None]:
label_df = pd.DataFrame(dat_label)
label_df['labels'] = label_df.iloc[:, 1:].values.tolist()
print(label_df.sample(5))

          FileName  ...                                             labels
39          jankin  ...  [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
764        traxler  ...  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
546  mungiupippidi  ...  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
288         hassel  ...  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
593           mair  ...  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...

[5 rows x 30 columns]


# **Data Labelling: Test**



In this section, we assign the newly created labels to student thesis proposals, either referring to their first or second preference. The finished data set will serve as a validation/test dataset.

In [None]:
# Load test data
data_test = pd.read_csv('/content/drive/MyDrive/ThesisAllocationSystem/data_final/test-proposals-final.csv', encoding = 'latin1')

In [None]:
# creating prof/research categorical label
domain_dict2 = {'thesisproposal1': 'munzert',
                'thesisproposal2': 'traxler',
                'thesisproposal3': 'bryson',
                'thesisproposal4': 'shaikh',
                'thesisproposal5': 'munzert',
                'thesisproposal6': 'iacovone'
}

In [None]:
# Clean file names
data_test["FileName"] = data_test["FileName"].str.replace(r'.txt$', '').str.lower()

# Add new column: domain
data_test["FileName"] = data_test["FileName"].map(domain_dict2)
data_test['content'] = data_test['Content']

# Merge with data label
data_test_final = pd.merge(data_test, label_df, on='FileName')

# Swap content and labels
col_list = list(data_test_final)
col_list[-1], col_list[-2] = col_list[-2], col_list[-1]
data_test_final.columns = col_list

Unnamed: 0,Content,domain_comparative_politics,domain_data_science,domain_development_economics,domain_development_studies,domain_digital_governance,domain_education,domain_energy_economics,domain_european_governance,domain_european_politics,domain_fiscal_governance,domain_health_economics,domain_international_law,domain_international_organizations,domain_international_security,domain_labour_policy,domain_macroeconomics,domain_migration_law,domain_non_profit,domain_organisational_management,domain_policy_process,domain_political_behaviour,domain_political_communication,domain_public_administration,domain_public_management,domain_social_policy,domain_strategic_management,domain_taxation,domain_technology_governance
0,b'Anabel Berj\xf3n S\xe1nchez \n\n \n\nPROPOSA...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,b'Master_Thesis_Proposal\n\n\nMaster Thesis Pr...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,b'Master_Thesis_Proposal\n\n\nMaster Thesis Pr...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,"b""New Thesis Proposal Form \n\nAY 2019-2020 \n...",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,b'Thesis Proposal \n\nCitizen Perceptions and ...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5,b'New Thesis Proposal Form \n\nAY 2020-2021 \n...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Save df
data_test_final.to_csv('/content/drive/MyDrive/ThesisAllocationSystem/data_final/test-proposals-label.csv', index = False)