<a href="https://colab.research.google.com/github/esnue/ThesisAllocationSystem/blob/main/label.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Preparation**

As part of the workflow between GitHub and Google Colab, please follow these steps: 
1. Upload the [data](https://drive.google.com/drive/folders/1ExS7M2OOkbYS5Z5O9pbPbaCpSa0rhGet?usp=sharing) to a folder in your GDrive. 
2. Mount your GDrive.
3. Set the data folder as your present working directory. 

In [None]:
# Mount Google Drive into the Colab filesystem so the data folder is
# reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Print the current working directory, then switch into the project folder
# on the mounted Drive.
!pwd
%cd /content/drive/MyDrive/ThesisAllocationSystem

/content
/content/drive/MyDrive/ThesisAllocationSystem


# **Data Labelling: Train**

We manually define a dictionary that assigns each professor one categorical label broadly describing their area of research. We then add these labels to the existing training dataset.

In [None]:
import numpy as np
import pandas as pd

# Professor surname -> broad research-domain label.
# Keys must equal the cleaned `FileName` values of the training papers
# (lower-case, no digits, no ".txt"), because the labels are attached with
# `data["FileName"].map(domain_dict)`.
# NOTE: 'flachsland' was previously misspelled 'clachsland'; the key matched no
# file name, so no `domain_climate_sustainability` column was ever created.
domain_dict = {
    'anheier': 'non_profit',
    'bryson': 'technology_governance',
    'cis': 'international_security',
    'cali': 'international_law',
    'cingolani': 'development_studies',
    'costello': 'migration_law',
    'flachsland': 'climate_sustainability',
    'graf': 'education',
    'hallerberg': 'fiscal_governance',
    'hammerschmid': 'public_management',
    'hassel': 'labour_policy',
    'hirth': 'energy_economics',
    'hustedt': 'public_administration',
    'iacovone': 'development_economics',
    'jachtenfuchs': 'european_governance',
    'jankin': 'data_science',
    'kayser': 'comparative_politics',
    'kreyenfeld': 'social_policy',
    'mair': 'strategic_management',
    'mena': 'organisational_management',
    'mungiu-pippidi': 'democracy_studies',
    'munzert': 'political_behaviour',
    'patz': 'international_organizations',
    'reh': 'european_politics',
    'roemmele': 'political_communication',
    'shaikh': 'health_economics',
    'snower': 'macroeconomics',
    'stockmann': 'digital_governance',
    'traxler': 'taxation',
    'wegrich': 'policy_process',
}

In [None]:
# Load train data
data = pd.read_csv('/content/drive/MyDrive/ThesisAllocationSystem/data_final/train-papers-final.csv', encoding = 'latin1')

# Reduce each file name to the bare professor surname: strip a trailing ".txt",
# remove all digits, then lower-case.
# `regex=True` is passed explicitly because pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently leave the names unchanged.
# Raw strings avoid the invalid "\d" escape, and the dot is escaped so that only
# a real ".txt" suffix is removed.
data["FileName"] = (
    data["FileName"]
    .str.replace(r'\.txt$', '', regex=True)
    .str.replace(r'\d+', '', regex=True)
    .str.lower()
)

print(data.sample(10))

     FileName                                            Content
459       cis  b'ETHNICITY, THE STATE, AND THE DURATION OF CI...
493  iacovone  b'World Bank Document\n\n\nPolicy Research Wor...
651   munzert  b'The conventional wisdom about tactical votin...
446       cis  b'Tracking and promoting the usage of a COVID-...
244      graf  b'OP-SOCO160040 185..206\n\n\nArticle\n\nCombi...
48     jankin  b'Economic voting in a crisis_ latest\n\n\nEco...
672     hirth  b'Notes\n\n\nIntegration Costs Revisited   \nA...
522  iacovone  b'World Bank Document\n\n\nPolicy Research Wor...
617      mair  b'Market Meets Community: Institutional Logics...
619      mair  b'Inhabited Actors: Internalizing Institutions...


In [None]:
# Create a domain column by looking up each professor name (FileName) in domain_dict
data["domain"] = data["FileName"].map(domain_dict)

# Names missing from domain_dict map to NaN and would end up with an all-zero
# label vector after one-hot encoding, so report them instead of failing silently.
unmapped_names = sorted(data.loc[data["domain"].isna(), "FileName"].dropna().unique())
if unmapped_names:
    print("WARNING: no domain label defined for:", unmapped_names)

print(data)

         FileName  ...             domain
0    hammerschmid  ...  public_management
1    hammerschmid  ...  public_management
2    hammerschmid  ...  public_management
3    hammerschmid  ...  public_management
4    hammerschmid  ...  public_management
..            ...  ...                ...
806       wegrich  ...     policy_process
807       wegrich  ...     policy_process
808       wegrich  ...     policy_process
809       wegrich  ...     policy_process
810       wegrich  ...     policy_process

[811 rows x 3 columns]


In [None]:
# Create binary dummy one-hot encoder for each research domain label.
# The dtype is pinned to uint8 (what this notebook originally produced): newer
# pandas versions default to bool, which would turn the label vectors written
# further down into lists of True/False instead of 0/1.
dum_df = pd.get_dummies(data, columns=["domain"], dtype=np.uint8)
type(dum_df['domain_comparative_politics'].iloc[1])

numpy.uint8

In [None]:
# Concatenate the first two columns of `data` (FileName, Content) with the
# one-hot domain columns of `dum_df`
data = pd.concat([data.iloc[:,:2], dum_df.iloc[:,2:]], axis = 1)

# Extract label: keep one row per professor. This has to happen here, while
# `data` still contains FileName; the "Extract labelled data" cells below rely on
# `dat_label` and raised a NameError while this line was commented out.
# `.copy()` gives an independent frame so later column assignments on it are safe.
dat_label = data.drop_duplicates('FileName').copy()

Unnamed: 0,FileName,Content,domain_comparative_politics,domain_data_science,domain_democracy_studies,domain_development_economics,domain_development_studies,domain_digital_governance,domain_education,domain_energy_economics,domain_european_governance,domain_european_politics,domain_fiscal_governance,domain_health_economics,domain_international_law,domain_international_organizations,domain_international_security,domain_labour_policy,domain_macroeconomics,domain_migration_law,domain_non_profit,domain_organisational_management,domain_policy_process,domain_political_behaviour,domain_political_communication,domain_public_administration,domain_public_management,domain_social_policy,domain_strategic_management,domain_taxation,domain_technology_governance
0,hammerschmid,"b'1 \n \n\nCurry, D., Hammerschmid, G., Jilke,...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,hammerschmid,b'2007 EGPA_paper1109.doc\n\n\nSee discussions...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,hammerschmid,b'The Governance of Infrastructure \n\n \n\nEd...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,hammerschmid,"b""COCOPS Working Paper no. 1\n\n\nCoordinating...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,hammerschmid,b'Administrative tradition and management refo...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
806,wegrich,b'S0017257X20000160jra 1..21\n\n\nARTICLE\n\nT...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
807,wegrich,b'0002855189 1..18\n\n\nComp. by: Bendict Rich...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
808,wegrich,b'Crowdsourcing and regulatory reviews: A new ...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
809,wegrich,b'ACCOMMODATING A FOREIGN OBJECT\n\n\nThis art...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [None]:
# Drop the professor name so that only Content and the one-hot label columns remain.
# Reassigning with errors='ignore' (instead of an in-place drop) keeps this cell
# safe to re-run: a second execution no longer raises KeyError.
data = data.drop(columns=['FileName'], errors='ignore')

In [None]:
# The one-hot columns created by get_dummies all carry the "domain_" prefix.
# Selecting them by name (rather than "every column after the first") keeps
# FileName/Content out of the label vector even if the drop cell above was skipped.
train_label_cols = [col for col in data.columns if col.startswith('domain_')]

# Training frame: paper text plus one list of 0/1 flags per paper
train_df = pd.DataFrame()
train_df['content'] = data['Content']
train_df['labels'] = data[train_label_cols].values.tolist()

# Check type and content
print(train_df.sample(10))
print(type(train_df['labels'].iloc[1]))
print(type(train_df['labels'].iloc[1][1]))
print(train_df.shape)

                                               content                                             labels
169  b"ISBN 978-94-6138-478-2 \n\nAvailable for fre...  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
556  b'The European Journal of  International Law V...  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
151  b'\xa9 The Author(s) (2019). Published by Oxfo...  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
638  b"1002 Initial Cover.pdf\n\n\neconstor\nMake Y...  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
420  b'Coalition Prospects and Policy Change: An Ap...  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
318  b"Striking Deals: Concertation in the Reform o...  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
602  b"Capturing the dynamics of the sharing econom...  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
424  b'The Media, the Economy and the Vote \n\nMark...  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
521  b"Microsoft Word - DFID-RTA final report 

In [None]:
# Write the labelled training frame (content + labels) to Drive, without the index column
train_df.to_csv(
    '/content/drive/MyDrive/ThesisAllocationSystem/data_final/train-papers-label.csv',
    index=False,
)

# **Extract labelled data**

In [None]:
# The per-professor label table does not need the paper text.
# Reassigning with errors='ignore' keeps the cell safe to re-run (the in-place
# drop raised KeyError on a second execution).
dat_label = dat_label.drop(columns=['Content'], errors='ignore')

In [None]:
# Collapse the one-hot columns into a single list-valued `labels` column.
# The columns are picked by their "domain_" prefix instead of by position, so
# re-running the cell cannot pull the existing `labels` column into the vector.
prof_label_cols = [col for col in dat_label.columns if col.startswith('domain_')]
dat_label['labels'] = dat_label[prof_label_cols].values.tolist()

In [None]:
# Lookup table: one row per professor, holding the name and its label vector
label_df = dat_label[['FileName', 'labels']].copy()

# Sanity check: frame type, type of one label vector, type of one entry in it
print(type(label_df))
print(type(label_df['labels'].iloc[1]))
print(type(label_df['labels'].iloc[1][1]))

<class 'int'>


# **Data Labelling: Test**



In this section, we assign the newly created labels to student thesis proposals, using the professor given as either the student's first or second preference. The finished dataset will serve as a validation/test dataset.

In [None]:
# Read the thesis-proposal CSV (latin-1 encoded) from Drive
data_test = pd.read_csv(
    '/content/drive/MyDrive/ThesisAllocationSystem/data_final/test-proposals-final.csv',
    encoding='latin1',
)

In [None]:
# Thesis proposal (cleaned file name) -> surname of the professor whose label
# the proposal receives
domain_dict2 = dict(
    thesisproposal1='munzert',
    thesisproposal2='traxler',
    thesisproposal3='bryson',
    thesisproposal4='shaikh',
    thesisproposal5='munzert',
    thesisproposal6='iacovone',
)

In [None]:
# Clean file names: strip a trailing ".txt" and lower-case.
# `regex=True` is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default; the dot is escaped so only a real ".txt" suffix is removed.
data_test["FileName"] = (
    data_test["FileName"]
    .str.replace(r'\.txt$', '', regex=True)
    .str.lower()
)

# Replace each proposal's file name with the surname of its professor so that it
# can be joined against label_df. Values that are not keys of domain_dict2 are
# left untouched, which keeps the cell safe to re-run (already-mapped surnames
# are not turned into NaN on a second execution).
data_test["FileName"] = data_test["FileName"].map(lambda name: domain_dict2.get(name, name))

# Report proposals that the inner merge below will drop for lack of a label
unmatched_names = sorted(set(data_test["FileName"].dropna()) - set(label_df["FileName"]))
if unmatched_names:
    print("WARNING: proposals dropped because no label was found for:", unmatched_names)

# Merge with data label and keep only text + label vector, in the same column
# order as the training file. The text is taken from the merged frame itself:
# re-attaching data_test['Content'] afterwards aligns on the pre-merge index and
# would pair texts with the wrong labels whenever the merge drops rows.
test_df = (
    pd.merge(data_test, label_df, on='FileName')
    [['Content', 'labels']]
    .rename(columns={'Content': 'content'})
)

# Check type and content
print(test_df.shape)
print(type(test_df))
print(type(test_df['labels'].iloc[1]))
print(type(test_df['labels'].iloc[1][1]))

Unnamed: 0,content,labels
0,b'Anabel Berj\xf3n S\xe1nchez \n\n \n\nPROPOSA...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,b'Master_Thesis_Proposal\n\n\nMaster Thesis Pr...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"b""New Thesis Proposal Form \n\nAY 2019-2020 \n...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,b'Master_Thesis_Proposal\n\n\nMaster Thesis Pr...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,b'Thesis Proposal \n\nCitizen Perceptions and ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,b'New Thesis Proposal Form \n\nAY 2020-2021 \n...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."


In [None]:
# Write the labelled proposals (content + labels) to Drive, without the index column
test_df.to_csv(
    '/content/drive/MyDrive/ThesisAllocationSystem/data_final/test-proposals-label.csv',
    index=False,
)