In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('specialisations.csv')
df.columns

Index(['Broad field', 'Narrow field', 'Detailed Field', 'Specialization Code',
       ' Specialization Name', 'Specialization Definition',
       'Code of Included Specialization ',
       'Other Specializations Included In The Definition',
       'Important Courses Under This Specialization', 'Unnamed: 9',
       'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13',
       'Unnamed: 14'],
      dtype='object')

Alter to match the ISCO type titles, so code can be similar to mapping_filter made previously for taxonomies

First fill the cells in narrow, detailed, and specialation field so all fields are filled and can be pulled out by row

In [4]:
df.rename(columns={'Narrow field': 'major', 'Detailed Field': 'sub_major', 'Specialization Code': 'minor', ' Specialization Name': 'minor_label', 'Code of Included Specialization ' :'unit', 'Other Specializations Included In The Definition': 'description'}, inplace=True)

df = df.drop(df.columns[[9, 10, 11, 12, 13, 14]],axis = 1)
df = df.drop(columns=['Broad field', 'Specialization Definition', 'Important Courses Under This Specialization'])

df.insert(loc=1, column='major_label', value=np.nan)
df.insert(loc=3, column='sub_major_label', value=np.nan)
df

Unnamed: 0,major,major_label,sub_major,sub_major_label,minor,minor_label,unit,description
0,,,,,,Generic Programs And Qualifications,,
1,0.0,,,,,Generic Programs And Qualifications Not Furthe...,,
2,,,0.0,,,Generic Programs And Qualifications Not Furthe...,,
3,,,,,0.0,Generic Programs And Qualifications Not Furthe...,00000000,Generic Programs And Qualifications Not Furthe...
4,,,,,,,00000001,The Preparatory Year
...,...,...,...,...,...,...,...,...
3277,,,,,109999.0,Other Specializations In Services Not Elsewher...,10999999,Other Specializations In Services Not Elsewher...
3278,,,,,,Field Unknown,,
3279,999.0,,,,,Field Unknown,,
3280,,,9999.0,,,Field Unknown,,


Now add labels to the majors and sub_majors 

In [5]:
df[65:75]

Unnamed: 0,major,major_label,sub_major,sub_major_label,minor,minor_label,unit,description
65,,,,,9999.0,Generic Programs And Qualifications Not Elsewh...,999999.0,Generic Programs And Qualifications Not Elsewh...
66,,,,,,Education,,
67,11.0,,,,,Education,,
68,,,110.0,,,Programs In Education Not Further Defined,,
69,,,,,11000.0,Specialization In Education Not Further Defined,1100000.0,Specialization In Education Not Further Defined
70,,,,,,,1100001.0,Newly Enrolled In Education
71,,,111.0,,,Education Science,,
72,,,,,11101.0,Islamic Education,1110101.0,Islamic Education
73,,,,,,,1110102.0,Islamic And Comparative Education
74,,,,,,,,


In [6]:
df.major_label = np.where(df.major.notnull(), df.minor_label, df.major_label)
df.sub_major_label = np.where(df.sub_major.notnull(), df.minor_label, df.sub_major)
df[65:75]

Unnamed: 0,major,major_label,sub_major,sub_major_label,minor,minor_label,unit,description
65,,,,,9999.0,Generic Programs And Qualifications Not Elsewh...,999999.0,Generic Programs And Qualifications Not Elsewh...
66,,,,,,Education,,
67,11.0,Education,,,,Education,,
68,,,110.0,Programs In Education Not Further Defined,,Programs In Education Not Further Defined,,
69,,,,,11000.0,Specialization In Education Not Further Defined,1100000.0,Specialization In Education Not Further Defined
70,,,,,,,1100001.0,Newly Enrolled In Education
71,,,111.0,Education Science,,Education Science,,
72,,,,,11101.0,Islamic Education,1110101.0,Islamic Education
73,,,,,,,1110102.0,Islamic And Comparative Education
74,,,,,,,,


Fill the cells below so that each row has multiple accessible labels

In [7]:
df['major'] = df['major'].fillna(method='ffill')
df['major_label'] = df['major_label'].fillna(method='ffill')
df['sub_major'] = df['sub_major'].fillna(method='ffill')
df['sub_major_label'] = df['sub_major_label'].fillna(method='ffill')
df['minor'] = df['minor'].fillna(method='ffill')
df['minor_label'] = df['minor_label'].fillna(method='ffill')
df.sub_major_label = np.where(df.sub_major.notnull(), df.minor_label, df.sub_major)
df[65:75]

Unnamed: 0,major,major_label,sub_major,sub_major_label,minor,minor_label,unit,description
65,9.0,Generic Programs And Qualifications Not Elsewh...,99.0,Generic Programs And Qualifications Not Elsewh...,9999.0,Generic Programs And Qualifications Not Elsewh...,999999.0,Generic Programs And Qualifications Not Elsewh...
66,9.0,Generic Programs And Qualifications Not Elsewh...,99.0,Education,9999.0,Education,,
67,11.0,Education,99.0,Education,9999.0,Education,,
68,11.0,Education,110.0,Programs In Education Not Further Defined,9999.0,Programs In Education Not Further Defined,,
69,11.0,Education,110.0,Specialization In Education Not Further Defined,11000.0,Specialization In Education Not Further Defined,1100000.0,Specialization In Education Not Further Defined
70,11.0,Education,110.0,Specialization In Education Not Further Defined,11000.0,Specialization In Education Not Further Defined,1100001.0,Newly Enrolled In Education
71,11.0,Education,111.0,Education Science,11000.0,Education Science,,
72,11.0,Education,111.0,Islamic Education,11101.0,Islamic Education,1110101.0,Islamic Education
73,11.0,Education,111.0,Islamic Education,11101.0,Islamic Education,1110102.0,Islamic And Comparative Education
74,11.0,Education,111.0,Islamic Education,11101.0,Islamic Education,,


Now remove the rows that do not have an 'unit'/'description'

In [8]:
df.dropna(subset=['unit'], inplace=True)
df.to_csv('checkUpdatedCSV.csv')
df

Unnamed: 0,major,major_label,sub_major,sub_major_label,minor,minor_label,unit,description
3,0.0,Generic Programs And Qualifications Not Furthe...,0.0,Generic Programs And Qualifications Not Furthe...,0.0,Generic Programs And Qualifications Not Furthe...,00000000,Generic Programs And Qualifications Not Furthe...
4,0.0,Generic Programs And Qualifications Not Furthe...,0.0,Generic Programs And Qualifications Not Furthe...,0.0,Generic Programs And Qualifications Not Furthe...,00000001,The Preparatory Year
5,0.0,Generic Programs And Qualifications Not Furthe...,0.0,Generic Programs And Qualifications Not Furthe...,0.0,Generic Programs And Qualifications Not Furthe...,00000002,Preparatory Year - Science Programs
6,0.0,Generic Programs And Qualifications Not Furthe...,0.0,Generic Programs And Qualifications Not Furthe...,0.0,Generic Programs And Qualifications Not Furthe...,00000003,Preparatory Year- Literature Programs
7,0.0,Generic Programs And Qualifications Not Furthe...,0.0,Generic Programs And Qualifications Not Furthe...,0.0,Generic Programs And Qualifications Not Furthe...,00000004,Scientific Programs
...,...,...,...,...,...,...,...,...
3259,104.0,Transport Services,1041.0,Driving The Trains,104107.0,Driving The Trains,10410701,Driving The Trains
3265,104.0,Transport Services,1041.0,Station And Passenger Services,104108.0,Station And Passenger Services,10410801,Station And Passenger Services
3274,108.0,Inter-Disciplinary Programs And Qualifications...,1088.0,Inter-Disciplinary Programs And Qualifications...,108888.0,Inter-Disciplinary Programs And Qualifications...,10888888,Inter-Disciplinary Programs And Qualifications...
3277,109.0,Other Programs In Services Not Elsewhere Class...,1099.0,Other Specializations In Services Not Elsewher...,109999.0,Other Specializations In Services Not Elsewher...,10999999,Other Specializations In Services Not Elsewher...


Now take the comparative values from the scraped CSV (Only pull relevant cols & reorder)

In [9]:
df1 = pd.read_csv('combinedUniversityCSV/allUniversities.csv', usecols=[ 'College', 'Department', 'Disciplines'])
df1 = df1[['College', 'Department', 'Disciplines']]
df1

Unnamed: 0,College,Department,Disciplines
0,,,
1,Shariah and Islamic Studies,,
2,,Shariah,
3,,,Shariah
4,,,Shariah and Educational Preparation
...,...,...,...
7772,,,Banking Business
7773,,,Computer Programming
7774,,,Computer Networking
7775,,,Accounting


In [10]:
df1['College'] = df1['College'].fillna(method='ffill')
df1['Department'] = df1['Department'].fillna(method='ffill')
df1['Disciplines'] = df1['Disciplines'].fillna(method='ffill')
df1

Unnamed: 0,College,Department,Disciplines
0,,,
1,Shariah and Islamic Studies,,
2,Shariah and Islamic Studies,Shariah,
3,Shariah and Islamic Studies,Shariah,Shariah
4,Shariah and Islamic Studies,Shariah,Shariah and Educational Preparation
...,...,...,...
7772,College of Technology at Namas,Institute of Public Administration,Banking Business
7773,College of Technology at Namas,Institute of Public Administration,Computer Programming
7774,College of Technology at Namas,Institute of Public Administration,Computer Networking
7775,College of Technology at Namas,Institute of Public Administration,Accounting


In [11]:
df1.to_csv('formattedMajors.csv')

#### Now try to use semantic matching techniques 

In [12]:
import pandas as pd
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import os
embed = SentenceTransformer('all-mpnet-base-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
main_df = pd.DataFrame()
main_df["unit"] = df["unit"]
main_df["combined"] = df["major_label"] + ", " + df["sub_major_label"] + ", " + df["minor_label"] + ", " + df["description"]
main_flattened = main_df.loc[:, "combined"].tolist()
main_embed = embed.encode(main_flattened)



In [14]:
scraped_df = pd.DataFrame()
scraped_df["combined_scraped"] = np.where(df1.Disciplines.notnull(), df1["College"] + ", " + df1["Department"] + ", " + df1["Disciplines"], " ")
scraped_flattened = scraped_df["combined_scraped"].tolist()
scraped_embed = embed.encode(scraped_flattened)


In [15]:
df
main_flattened

['Generic Programs And Qualifications Not Further Defined, Generic Programs And Qualifications Not Further Defined, Generic Programs And Qualifications Not Further Defined, Generic Programs And Qualifications Not Further Defined',
 'Generic Programs And Qualifications Not Further Defined, Generic Programs And Qualifications Not Further Defined, Generic Programs And Qualifications Not Further Defined, The\xa0Preparatory Year\xa0',
 'Generic Programs And Qualifications Not Further Defined, Generic Programs And Qualifications Not Further Defined, Generic Programs And Qualifications Not Further Defined, \xa0Preparatory Year\xa0- Science Programs',
 'Generic Programs And Qualifications Not Further Defined, Generic Programs And Qualifications Not Further Defined, Generic Programs And Qualifications Not Further Defined, Preparatory Year- Literature Programs',
 'Generic Programs And Qualifications Not Further Defined, Generic Programs And Qualifications Not Further Defined, Generic Programs An

In [16]:
df1
scraped_flattened
scraped_df.to_csv("scraped_df_with_unit.csv")

In [17]:
main_embed

array([[ 0.00776588, -0.01778978, -0.00367176, ..., -0.02155426,
        -0.05080649, -0.00939585],
       [ 0.00427416, -0.0343257 ,  0.0014358 , ..., -0.01038648,
        -0.01060196, -0.03650972],
       [ 0.00900667, -0.0516398 ,  0.01172471, ..., -0.00251682,
        -0.006213  , -0.04392132],
       ...,
       [ 0.01858052, -0.07012565,  0.00348208, ..., -0.01628519,
         0.02225082, -0.01100876],
       [ 0.03840241, -0.01478914, -0.01332792, ..., -0.03053496,
        -0.01235545, -0.03379745],
       [ 0.03285605, -0.00035373, -0.02734045, ...,  0.0547141 ,
        -0.001158  , -0.02813016]], dtype=float32)

In [18]:
scraped_embed

array([[-0.01250334,  0.06143883, -0.00673456, ..., -0.00193858,
        -0.05036438, -0.01904943],
       [-0.0125034 ,  0.06143874, -0.00673447, ..., -0.00193857,
        -0.05036449, -0.01904942],
       [-0.0125034 ,  0.06143874, -0.00673447, ..., -0.00193857,
        -0.05036449, -0.01904942],
       ...,
       [ 0.00658361,  0.0128534 , -0.03167773, ..., -0.03434928,
         0.00554282, -0.02639209],
       [-0.01949053,  0.03752382, -0.0553725 , ..., -0.00083468,
        -0.00863802, -0.0201041 ],
       [ 0.03043541,  0.00910206, -0.03650396, ...,  0.01983908,
         0.03649983, -0.00194225]], dtype=float32)

In [19]:
sims = torch.nn.functional.normalize(torch.from_numpy(scraped_embed)) @ torch.nn.functional.normalize(torch.from_numpy(main_embed)).t()
max_idx = torch.argmax(sims, axis  = 1).numpy()
prob = pd.DataFrame(np.amax(sims.numpy(), axis = 1), columns = ['prob'])
prob

Unnamed: 0,prob
0,0.247535
1,0.247535
2,0.247535
3,0.903997
4,0.869754
...,...
7772,0.718848
7773,0.670435
7774,0.707036
7775,0.719870


In [20]:
tmp_df = pd.concat([scraped_df.reset_index(drop=True), main_df.iloc[max_idx].reset_index(drop=True)], axis = 1)
tmp_df
df3 = tmp_df

#df3.drop(df.columns[[0,1,2,3,4,5,6,7,9,10,11]], axis = 1)
#df2 = pd.concat([tmp_df, prob], axis = 1)
#df.columns
#df3=df.drop(df.columns[[0,1,2,3,4,5,6,7]], axis = 1)
df3.to_csv("initialSemanticMatching.csv")
df3

Unnamed: 0,combined_scraped,unit,combined
0,,09191101,"Health, Health Promotion, Health Promotion, He..."
1,,09191101,"Health, Health Promotion, Health Promotion, He..."
2,,09191101,"Health, Health Promotion, Health Promotion, He..."
3,"Shariah and Islamic Studies, Shariah, Shariah",04210902,"Law, Shari'ah, Shari'ah, Shari'ah And Islamic ..."
4,"Shariah and Islamic Studies, Shariah, Shariah ...",04210902,"Law, Shari'ah, Shari'ah, Shari'ah And Islamic ..."
...,...,...,...
7772,"College of Technology at Namas, Institute of P...",04120405,"Business And Administration, Banking And Finan..."
7773,"College of Technology at Namas, Institute of P...",06130102,Information And Communication Technologies (IC...
7774,"College of Technology at Namas, Institute of P...",06120102,Information And Communication Technologies (IC...
7775,"College of Technology at Namas, Institute of P...",04110104,"Business And Administration, Accounting, Accou..."


In [21]:
sims = torch.nn.functional.normalize(torch.from_numpy(scraped_embed)) @ torch.nn.functional.normalize(torch.from_numpy(main_embed)).t()
max_idx = torch.argmax(sims, axis  = 1).numpy()
prob = pd.DataFrame(np.amax(sims.numpy(), axis = 1), columns = ['prob'])
tmp_df = pd.concat([scraped_df.reset_index(drop=True), main_df.iloc[max_idx].reset_index(drop=True)], axis = 1)
a = pd.concat([tmp_df, prob], axis = 1)

a.to_csv("combined_title_mappings.csv")

In [22]:
checkSimilarities = a 
checkSimilarities= checkSimilarities[checkSimilarities['prob'] < .6]
#checkSimilarities['prob'] = checkSimilarities['prob' < .75]
checkSimilarities.to_csv("examine_smaller_probs.csv")

### Remove the college from combinations in scraped_df to see affect

In [None]:
scraped_df = pd.DataFrame()
scraped_df["combined_scraped"] = np.where(df1.Disciplines.notnull(), df1["Department"] + ", " + df1["Disciplines"], " ")
scraped_flattened = scraped_df["combined_scraped"].tolist()
scraped_embed = embed.encode(scraped_flattened)

In [None]:
sims = torch.nn.functional.normalize(torch.from_numpy(scraped_embed)) @ torch.nn.functional.normalize(torch.from_numpy(main_embed)).t()
max_idx = torch.argmax(sims, axis  = 1).numpy()
prob = pd.DataFrame(np.amax(sims.numpy(), axis = 1), columns = ['prob'])
tmp_df = pd.concat([scraped_df.reset_index(drop=True), main_df.iloc[max_idx].reset_index(drop=True)], axis = 1)
a = pd.concat([tmp_df, prob], axis = 1)

a.to_csv("mappings-without-college.csv")
checkSimilarities = a 
checkSimilarities= checkSimilarities[checkSimilarities['prob'] < .5]
checkSimilarities.to_csv("examine_smaller_probs-ALL-COMBOS.csv")

### REMOVE UNNECESSARY COLLEGE INFORMATION

In [None]:
scraped_df = pd.DataFrame()
scraped_df["College"] = np.where(df1.College.str.contains("college", case=False) | df1.College.str.contains("community", case=False), "", df1.College)
scraped_df["combined_scraped"] = np.where(df1.Disciplines.notnull(), scraped_df["College"] + " " + df1["Department"] + ", " + df1["Disciplines"], " ")
scraped_flattened = scraped_df["combined_scraped"].tolist()
scraped_df.drop(columns=["College"])
scraped_embed = embed.encode(scraped_flattened)

In [None]:
sims = torch.nn.functional.normalize(torch.from_numpy(scraped_embed)) @ torch.nn.functional.normalize(torch.from_numpy(main_embed)).t()
max_idx = torch.argmax(sims, axis  = 1).numpy()
prob = pd.DataFrame(np.amax(sims.numpy(), axis = 1), columns = ['prob'])
tmp_df = pd.concat([scraped_df.reset_index(drop=True), main_df.iloc[max_idx].reset_index(drop=True)], axis = 1)
a = pd.concat([tmp_df, prob], axis = 1)

a.to_csv("mappings-with-REDACTED-COLLEGE-LIST.csv")
checkSimilarities = a 
checkSimilarities= checkSimilarities[checkSimilarities['prob'] < .5]
checkSimilarities.to_csv("examine_smaller_probs-REDACTED-COLLEGE-LIST.csv")

### Keep editing college to remove university (no detail of location) add in "college of" as it includes keywords, as well as remove "literary education of " as for college of "Literary Education for Girls in Bisha" seems to be inaccurate translation causing incorrect mapping

In [None]:
scraped_df = pd.DataFrame()
scraped_df["College"] = np.where(df1.College.str.contains("college", case=False) | df1.College.str.contains("community", case=False) | df1.College.str.contains("literary education for girls in", case=False)| df1.College.str.contains("university", case=False), "", df1.College)

scraped_df["College"] = np.where(df1.College.str.contains("college of", case=False), df1.College, scraped_df.College)


scraped_df["combined_scraped"] = np.where(df1.Disciplines.notnull(), scraped_df["College"] + " " + df1["Department"] + ", " + df1["Disciplines"], " ")
scraped_flattened = scraped_df["combined_scraped"].tolist()
scraped_df.drop(columns=["College"])

scraped_df
scraped_embed = embed.encode(scraped_flattened)

In [None]:
sims = torch.nn.functional.normalize(torch.from_numpy(scraped_embed)) @ torch.nn.functional.normalize(torch.from_numpy(main_embed)).t()
max_idx = torch.argmax(sims, axis  = 1).numpy()
prob = pd.DataFrame(np.amax(sims.numpy(), axis = 1), columns = ['prob'])
tmp_df = pd.concat([scraped_df.reset_index(drop=True), main_df.iloc[max_idx].reset_index(drop=True)], axis = 1)
a = pd.concat([tmp_df, prob], axis = 1)

a.to_csv("mappings-with-REDACTED-COLLEGE-LIST-2.csv")
checkSimilarities = a 
checkSimilarities= checkSimilarities[checkSimilarities['prob'] < .5]
checkSimilarities.to_csv("examine_smaller_probs-REDACTED-COLLEGE-LIST-2.csv")

In [None]:
def get_different_rows(source_df, new_df):
    """Returns just the rows from the new dataframe that differ from the source dataframe"""
    merged_df = source_df.merge(new_df, indicator=True, how='outer')
    changed_rows_df = merged_df[merged_df['_merge'] == 'right_only']
    return changed_rows_df.drop('_merge', axis=1)

In [None]:
adjusted_df1 = pd.read_csv("mappings-with-REDACTED-COLLEGE-LIST.csv")
adjusted_df2 = pd.read_csv("mappings-with-REDACTED-COLLEGE-LIST-2.csv")
diff_df = pd.DataFrame
adjusted_df1.drop(columns= ["College", 'Unnamed: 0'], inplace=True)
adjusted_df2.drop(columns= ["College", 'Unnamed: 0'], inplace=True)
adjusted_df2.rename(columns={"combined" : "combined2", "combined_scraped" : "combined_scraped2", "prob": "prob2"}, inplace=True)
diff_df = pd.concat([adjusted_df1, adjusted_df2], axis=1)

diff_df = diff_df[diff_df["combined"] != diff_df["combined2"]]


#diff_df["difference"] = np.where(adjusted_df1["combined"] != adjusted_df2["combined"], ""
#diff_df["v2_title"] = adjusted_df2["combined"]

diff_df.to_csv("differences.csv")
diff_df

Unnamed: 0,combined_scraped,combined,prob,combined_scraped2,combined2,prob2
1534,"Internal and Surgical Nursing, Public Health ...","Health, Internal Medicine And Surgery Nursing,...",0.790737,College of Nursing Internal and Surgical Nursi...,"Health, Clinical Nursing, Clinical Nursing, Ge...",0.769244
1536,"Motherhood and Childhood, Internal and Surgic...","Health, Maternity And Childhood Nursing, Mater...",0.584672,"College of Nursing Motherhood and Childhood, I...","Health, Paediatrics Nursing, Paediatrics Nursi...",0.692304
1537,"Motherhood and Childhood, Motherhood and Chil...","Welfare, Family And Child Sciences, Family And...",0.529375,"College of Nursing Motherhood and Childhood, M...","Health, Maternity And Childhood Nursing, Mater...",0.662789
1538,"Emergency Nursing, Motherhood and Childhood","Health, Emergency Nursing, Emergency Nursing, ...",0.643591,"College of Nursing Emergency Nursing, Motherho...","Health, Emergency Nursing, Emergency Nursing, ...",0.707990
1541,"Critical Care Nursing, Critical Care Nursing","Health, Emergency Nursing, Emergency Nursing, ...",0.791946,"College of Nursing Critical Care Nursing, Crit...","Health, Paediatrics Critical Care Nursing , Pa...",0.696792
...,...,...,...,...,...,...
7757,"Higher Diploma in Physiotherapy, Health Infor...","Health, Physical Therapy, Physical Therapy, Ne...",0.676424,College of Technology at Namas Higher Diploma ...,"Health, Physical Therapy, Physical Therapy, Mu...",0.665998
7762,"Institute of Public Administration, Financial...","Business And Administration, Accounting, Accou...",0.724690,College of Technology at Namas Institute of Pu...,"Business And Administration, Financial Managem...",0.715628
7772,"Institute of Public Administration, Banking B...","Business And Administration, Accounting, Accou...",0.735352,College of Technology at Namas Institute of Pu...,"Business And Administration, Financial Managem...",0.669760
7775,"Institute of Public Administration, Accounting","Business And Administration, Accounting, Accou...",0.753685,College of Technology at Namas Institute of Pu...,"Business And Administration, Accounting, Accou...",0.674045


### Try mixing all of the CSVs into one to account for all of the peculiarities

In [None]:
df_a = pd.read_csv("mappings-all-combined.csv")
df_b = pd.read_csv("mappings-without-college.csv")
df_c = pd.read_csv("mappings-with-REDACTED-COLLEGE-LIST-2.csv")
df_d = pd.read_csv("mappings-with-REDACTED-COLLEGE-LIST.csv")

df_ab = pd.DataFrame()

df_ab["prob"] = ''
df_ab['combined'] = ''
df_ab['combined_scraped'] = ''

df_ab["prob"] = np.where(df_a["prob"] > df_b["prob"], df_a["prob"], df_b["prob"])
df_ab["combined"] = np.where(df_a["prob"] > df_b["prob"], df_a["combined"], df_b["combined"])
df_ab["combined_scraped"] = np.where(df_a["prob"] > df_b["prob"], df_a["combined_scraped"], df_b["combined_scraped"])
df_ab[1530:1540]


df_cd = pd.DataFrame()

df_cd["prob"] = ''
df_cd['combined'] = ''
df_cd['combined_scraped'] = ''

df_cd["prob"] = np.where(df_c["prob"] > df_d["prob"], df_c["prob"], df_d["prob"])
df_cd["combined"] = np.where(df_c["prob"] > df_d["prob"], df_c["combined"], df_d["combined"])
df_cd["combined_scraped"] = np.where(df_c["prob"] > df_d["prob"], df_c["combined_scraped"], df_d["combined_scraped"])
df_cd[1530:1540]


df_all = pd.DataFrame()
df_all["prob"] = np.where(df_ab["prob"] > df_cd["prob"], df_ab["prob"], df_cd["prob"])
df_all["combined"] = np.where(df_ab["prob"] > df_cd["prob"], df_ab["combined"], df_cd["combined"])
df_all["combined_scraped"] = np.where(df_ab["prob"] > df_cd["prob"], df_ab["combined_scraped"], df_cd["combined_scraped"])
df_all_low = df_all[df_all["prob"] < .55]
df_all_low = df_all[df_all["combined_scraped"] !=' ']
df_all_low.to_csv('lowest-matches.csv')
df_all_low

Unnamed: 0,prob,combined,combined_scraped
3,0.903997,"Law, Shari'ah, Shari'ah, Shari'ah And Islamic ...","Shariah and Islamic Studies, Shariah, Shariah"
4,0.869754,"Law, Shari'ah, Shari'ah, Shari'ah And Islamic ...","Shariah and Islamic Studies, Shariah, Shariah ..."
5,0.921684,"Law, Shari'ah, Shari'ah, Shari'ah And Islamic ...","Shariah and Islamic Studies, Shariah, Jurispru..."
6,0.902770,"Law, Shari'ah, Shari'ah, Fundamentals Of Juris...","Shariah and Islamic Studies, Shariah, Fundamen..."
7,0.843801,"Law, Shari'ah, Shari'ah, Shari'ah And Islamic ...","Shariah and Islamic Studies, Accounting, Funda..."
...,...,...,...
7772,0.735352,"Business And Administration, Accounting, Accou...","Institute of Public Administration, Banking B..."
7773,0.670435,Information And Communication Technologies (IC...,"College of Technology at Namas, Institute of P..."
7774,0.707036,Information And Communication Technologies (IC...,"College of Technology at Namas, Institute of P..."
7775,0.753685,"Business And Administration, Accounting, Accou...","Institute of Public Administration, Accounting"


### now look to 
- remove "for girls" & "for boys" because those seem to be equivalent to secondary education
- "admission is suspended" remove for the search but keep it in the main csv file

In [57]:
scraped_df = pd.DataFrame()
scraped_df["College"] = df1["College"]
scraped_df['College'] = scraped_df['College'].str.replace('admisssion is suspended', '', regex = True)
scraped_df["Department"] = df1["Department"]
scraped_df['Department'] = scraped_df['Department'].str.replace("admisssion is suspended", '', regex=True)
scraped_df["Disciplines"] = df1["Disciplines"]
scraped_df['Disciplines'] = scraped_df.Disciplines.str.replace('admisssion is suspended','', regex = True)



scraped_df["College"] = np.where(scraped_df.College.str.contains("university", case=False) | scraped_df.College.str.contains("community", case=False) | scraped_df.College.str.contains("for girls", case=False)| scraped_df.College.str.contains("for boys", case=False), "", scraped_df.College)


scraped_df["combined_scraped"] = np.where(df1.Disciplines.notnull(), scraped_df["College"] + " " + scraped_df["Department"] + ", " + scraped_df["Disciplines"], " ")
scraped_flattened = scraped_df["combined_scraped"].tolist()
scraped_df.drop(columns=["College"])

scraped_df.to_csv("checkforsuspension.csv")
#scraped_embed = embed.encode(scraped_flattened)

In [58]:
#scraped_df["Department"] = np.where(scraped_df.Department.str.contains("suspended"), "", scraped_df.Department.str.removeprefix)
scraped_df.Disciplines = scraped_df.Disciplines.replace(to_replace=r'admission is suspended-', value='', regex=True)
scraped_df.to_csv("checkforsuspension.csv")


In [30]:
sims = torch.nn.functional.normalize(torch.from_numpy(scraped_embed)) @ torch.nn.functional.normalize(torch.from_numpy(main_embed)).t()
max_idx = torch.argmax(sims, axis  = 1).numpy()
prob = pd.DataFrame(np.amax(sims.numpy(), axis = 1), columns = ['prob'])
tmp_df = pd.concat([scraped_df.reset_index(drop=True), main_df.iloc[max_idx].reset_index(drop=True)], axis = 1)
a = pd.concat([tmp_df, prob], axis = 1)

a.to_csv("mappings-with-REDACTED-COLLEGE-LIST-3.csv")

In [None]:
df_e = pd.read_csv("mappings-with-REDACTED-COLLEGE-LIST-3.csv")

df_all2 = df_all
df_all2["prob"] = np.where(df_all["prob"] > df_e["prob"], df_all["prob"], df_e["prob"])
df_all2["combined"] = np.where(df_all["prob"] > df_e["prob"], df_all["combined"], df_e["combined"])
df_all2["combined_scraped"] = np.where(df_all["prob"] > df_e["prob"], df_all["combined_scraped"], df_e["combined_scraped"])

'''
df_all["prob"] = np.where(df_all.combined_scraped.str.contains("admission is suspended", case=False), df_e["prob"],  df_all["prob"])
df_all["combined"] = np.where(df_all.combined_scraped.str.contains("admission is suspended", case=False), df_e["combined"],  df_all["combined"])
df_all["combined_scraped"] = np.where(df_all.combined_scraped.str.contains("admission is suspended", case=False), df_e["combined_scraped"], df_all["combined_scraped"])
'''

parts_to_fix = df_all2
parts_to_fix = parts_to_fix[parts_to_fix["prob"] < .55]
parts_to_fix = parts_to_fix[parts_to_fix["combined_scraped"] !=' ']
parts_to_fix.to_csv('lowest-matches.csv')
parts_to_fix


Unnamed: 0,prob,combined,combined_scraped
39,0.537038,"Programs In Business, Administration And Law N...",Higher Institute of Promotion of Virtues and P...
569,0.549625,"Business And Administration, Accounting, Accou...",The Higher Institute of Dawah and Ihtsab Accou...
955,0.543281,"Engineering And Engineering Trades, Energy Eng...",Science Engineering and Sciences Joint Program...
1294,0.546397,"Basic Programs And Qualifications, Architectur...","Digital Architectural Drawing, Visual Impairment"
3041,0.546511,"Health, Basic Medical Sciences, Basic Medical ...",admission is suspended Applied Medical Scienc...
3042,0.546511,"Health, Basic Medical Sciences, Basic Medical ...",admission is suspended Applied Medical Scienc...
3315,0.516911,Information And Communication Technologies (IC...,"Computer, Computer"
3316,0.516911,Information And Communication Technologies (IC...,"Computer, Computer"
3860,0.52199,Information And Communication Technologies (IC...,"Science, Management and lnformation Systems, M..."
4253,0.479857,"Programs In Business, Administration And Law N...","University BranchTaima, Shipping, Shipping"


^^^ this was looped over several times in order to try and find the flaws in the mapping and get them to be as close

### create a new CSV with included codes

In [None]:
updatedMapping = pd.read_csv("combinedUniversityCSV/allUniversities.csv")
updatedMapping.insert(loc=8,
          column='code',
          value='')
#updatedMapping["code"] = np.where(updatedMapping["Disciplines"].notnull, "", updatedMapping["code"])
updatedMapping.code = np.where(updatedMapping.Disciplines == "", "", a.unit)
updatedMapping
updatedMapping.to_csv('taxonomy_with_codes.csv')