<a href="https://colab.research.google.com/github/esnue/ThesisAllocationSystem/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preparation

As part of the workflow between GitHub and Google Colab, please follow these steps: 
1. Upload the data to a folder in your GDrive. 
2. Mount your GDrive.
3. Set the data folder as your present working directory. 

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pwd
%cd /content/drive/MyDrive/ThesisAllocationSystem

# Convert PDF to TXT

Convert all PDF files in the current working directory to TXT files.

In [None]:
!pip install tika

In [None]:
import os
from tika import parser

def read_pdf(pdf_file):

    text = parser.from_file(pdf_file)['content']
    return text.encode("latin-1","ignore")

def pdf_to_txt(folder_with_pdf, dest_folder):
    pdf_files = []

    for root, dirs, files in os.walk(folder_with_pdf):
        for f in files:
            if '.pdf' in f:
                pdf_files.append(os.path.join(root, f))
    #print(pdf_files)

    for file_ in pdf_files:
        text_file = os.path.splitext(os.path.basename(file_))[0]+'.txt'
        with open(os.path.join(dest_folder,text_file), 'wb') as text_f:
            text_f.write(read_pdf(file_))

    return None

In [None]:
pdf_to_txt('./supervisors', './supervisors-txt') 

In [None]:
pdf_to_txt('./train-papers', './train-papers-txt') 

In [None]:
pdf_to_txt('./raw dat', './output') 

In [None]:
pdf_to_txt('./raw dat', './output') 

In [None]:
pdf_to_txt('./raw dat', './output') 

# Put TXT files into CSV

After importing the packages, define the directory of interest and run the function below to create a CSV files that entails all TXT files in the following structure: 

Filename | Content 

In [None]:
import csv
from pathlib import Path

In [None]:
def txt_to_csv(x): 

    os.chdir('/content/drive/MyDrive/ThesisAllocationSystem/' + x)

    with open(x + '.csv', 'w', encoding = 'Latin-1') as out_file:
        csv_out = csv.writer(out_file)
        csv_out.writerow(['FileName', 'Content'])
        for fileName in Path('.').glob('*.txt'):
            lines = [ ]
            with open(str(fileName.absolute()),'rb') as one_text:
                for line in one_text.readlines():
                    lines.append(line.decode(encoding='Latin-1',errors='ignore').strip())
            csv_out.writerow([str(fileName),' '.join(lines)])

In [None]:
txt_to_csv('train-papers-txt')

In [None]:
txt_to_csv('test-theses')

In [None]:
txt_to_csv('test-proposals')

In [None]:
txt_to_csv('supervisors')

## **Data Labelling**

We manually define a dictionary containing a categorical label for each professor broadly describing their area of research. 

In [3]:
import pandas as pd
import numpy as np

# creating prof/research categorical label
domain_dict = {'Anheier': 'non_profit',
              'Bryson': 'technology_governance',
              'CIS': 'international_security',
              'Cali': 'international_law',
              'Cingolani': 'development_studies',              
              'Costello': 'migration_law',
              'Flachsland': 'climate_sustainability',
              'Graf': 'education',
              'Hallerberg': 'fiscal_governance',
              'Hammerschmid': 'public_management',
              'Hassel': 'labour_policy',
              'Hirth': 'energy_economics',
              'Hustedt': 'public_administration',
              'Iacovone': 'development_economics',
              'Jachtenfuchs': 'european_governance',
              'Jankin': 'data_science',
              'Kayser': 'comparative_politics',
              'Kreyenfeld': 'social_policy',
              'Mair': 'strategic_management',
              'Mena': 'organisational_management',              
              'Mungiu-Pippidi': 'democracy_studies',
              'Munzert': 'political_behaviour',
              'Patz': 'international_organizations',
              'Reh': 'european_politics',
              'Roemmele': 'political_communication'                         
}


In [8]:
#load training data frame 
train_df = pd.read_csv('/content/drive/MyDrive/ThesisAllocationSystem/train-papers-txt/train-papers-txt.csv', encoding='latin1')

#clean FileNames from txt ending
train_df["FileName"] = train_df["FileName"].str.replace(r'.txt$', '').str.replace(r'\d+', '')

print(train_df)

         FileName                                            Content
0    Hammerschmid                                                ...
1    Hammerschmid                                         2007 EG...
2    Hammerschmid                                                ...
3    Hammerschmid                                               C...
4    Hammerschmid                                                ...
..            ...                                                ...
377    Kreyenfeld                                                ...
378        Bryson                                            Gaud...
379        Bryson                                                ...
380        Bryson                                         1240027...
381        Bryson                                                ...

[382 rows x 2 columns]


In [10]:
#create a domain column to facilitate mapping on dictionary keys and pass labels as value
train_df["domain"] = train_df["FileName"].map(domain_dict)

print(train_df)

         FileName  ...                 domain
0    Hammerschmid  ...      public_management
1    Hammerschmid  ...      public_management
2    Hammerschmid  ...      public_management
3    Hammerschmid  ...      public_management
4    Hammerschmid  ...      public_management
..            ...  ...                    ...
377    Kreyenfeld  ...          social_policy
378        Bryson  ...  technology_governance
379        Bryson  ...  technology_governance
380        Bryson  ...  technology_governance
381        Bryson  ...  technology_governance

[382 rows x 3 columns]


In [11]:
#create binary dummy one-hot encoder for each research domain label
dum_df = pd.get_dummies(train_df, columns=["domain"])
dum_df

Unnamed: 0,FileName,Content,domain_climate_sustainability,domain_data_science,domain_education,domain_fiscal_governance,domain_international_law,domain_labour_policy,domain_migration_law,domain_non_profit,domain_political_communication,domain_public_administration,domain_public_management,domain_social_policy,domain_technology_governance
0,Hammerschmid,...,0,0,0,0,0,0,0,0,0,0,1,0,0
1,Hammerschmid,2007 EG...,0,0,0,0,0,0,0,0,0,0,1,0,0
2,Hammerschmid,...,0,0,0,0,0,0,0,0,0,0,1,0,0
3,Hammerschmid,C...,0,0,0,0,0,0,0,0,0,0,1,0,0
4,Hammerschmid,...,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
377,Kreyenfeld,...,0,0,0,0,0,0,0,0,0,0,0,1,0
378,Bryson,Gaud...,0,0,0,0,0,0,0,0,0,0,0,0,1
379,Bryson,...,0,0,0,0,0,0,0,0,0,0,0,0,1
380,Bryson,1240027...,0,0,0,0,0,0,0,0,0,0,0,0,1


In [14]:
#concate the two dataframes
train_df = pd.concat([train_df, dum_df], axis = 1)
train_df

Unnamed: 0,FileName,Content,domain,FileName.1,Content.1,domain_climate_sustainability,domain_data_science,domain_education,domain_fiscal_governance,domain_international_law,domain_labour_policy,domain_migration_law,domain_non_profit,domain_political_communication,domain_public_administration,domain_public_management,domain_social_policy,domain_technology_governance
0,Hammerschmid,...,public_management,Hammerschmid,...,0,0,0,0,0,0,0,0,0,0,1,0,0
1,Hammerschmid,2007 EG...,public_management,Hammerschmid,2007 EG...,0,0,0,0,0,0,0,0,0,0,1,0,0
2,Hammerschmid,...,public_management,Hammerschmid,...,0,0,0,0,0,0,0,0,0,0,1,0,0
3,Hammerschmid,C...,public_management,Hammerschmid,C...,0,0,0,0,0,0,0,0,0,0,1,0,0
4,Hammerschmid,...,public_management,Hammerschmid,...,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
377,Kreyenfeld,...,social_policy,Kreyenfeld,...,0,0,0,0,0,0,0,0,0,0,0,1,0
378,Bryson,Gaud...,technology_governance,Bryson,Gaud...,0,0,0,0,0,0,0,0,0,0,0,0,1
379,Bryson,...,technology_governance,Bryson,...,0,0,0,0,0,0,0,0,0,0,0,0,1
380,Bryson,1240027...,technology_governance,Bryson,1240027...,0,0,0,0,0,0,0,0,0,0,0,0,1
