In [87]:
import pandas as pd

In [88]:
def remove_string(data, columns, strings_to_remove):
    """
    Removes specified literal substrings from the given columns of a pandas DataFrame.

    Every entry of ``strings_to_remove`` is matched literally, never as a
    regular expression, so characters such as ``+``, ``.`` or ``(`` are safe
    to pass. Only the rows that actually contain the substring are rewritten,
    which leaves missing values and non-string cells untouched. The DataFrame
    is modified in place and is also returned.

    Args:
        data (pandas.DataFrame): The DataFrame to process.
        columns (list): A list of column names to remove strings from. The
            columns must support the pandas ``.str`` accessor.
        strings_to_remove (list): A list of literal strings to remove from the columns.

    Returns:
        pandas.DataFrame: The same DataFrame object with the specified strings removed.
    """
    for string in strings_to_remove:
        for col in columns:
            # regex=False: `string` is plain text; the default regex matching
            # would raise on '+' or '(' and would treat '.' as a wildcard
            mask = data[col].str.contains(string, na=False, regex=False)

            # Rewrite only the matching rows so NaN / non-string cells are preserved
            data.loc[mask, col] = data.loc[mask, col].str.replace(string, '', regex=False)

    return data

In [89]:
# Cast the selected columns to numeric dtypes
def convert_to_numeric(dataframe, columns):
    """
    Cast the selected columns of a dataframe to numeric dtypes.

    The conversion is delegated to ``pd.to_numeric``; the result is written
    back into the same dataframe, which is also returned.

    Parameters:
    dataframe (pandas.DataFrame): The dataframe to update.
    columns (list): Names of the columns to convert.

    Returns:
    pandas.DataFrame: The same dataframe, with the selected columns converted.
    """
    numeric_values = dataframe[columns].apply(pd.to_numeric)
    dataframe[columns] = numeric_values
    return dataframe

In [90]:
# Swap decimal commas for decimal points
def change_decimal_place(dataframe, columns):
    """
    Turn every comma into a decimal point in the selected columns.

    The substitution is a regex replacement, so each comma inside a string
    value is replaced. The dataframe is updated in place and also returned.

    Args:
        dataframe (pandas.DataFrame): The dataframe to update.
        columns (str or list): The column(s) whose commas are replaced.

    Returns:
        pandas.DataFrame: The same dataframe, with commas replaced by points in the selected columns.
    """
    selected = dataframe[columns]
    dataframe[columns] = selected.replace(to_replace=',', value='.', regex=True)
    return dataframe

In [91]:
# Function to open and preprocess ranking data for a given subject
def open_subject_data(subject):
    """
    Opens and processes the data for a specific subject.

    Reads ``../../data/raw/ranking_by_subject/{subject}.csv`` (semicolon
    delimited, UTF-8), then:

    1. drops the 'International' column when present;
    2. in the '2022' and '2023' columns, keeps only the part before the first
       '-' (e.g. a range such as "101-150" becomes "101") and removes '=' and
       space characters;
    3. replaces decimal commas with decimal points in the score columns;
    4. converts the score and year columns to numeric;
    5. appends ``_{subject}`` to the names of the score and year columns.

    Year columns that pandas did not read as text (for example a column that
    is entirely empty, or one that only holds plain numbers) are skipped in
    step 2, because there is nothing to strip from them.

    Parameters:
    subject (str): The name of the subject for which the data needs to be opened.

    Returns:
    pandas.DataFrame: The processed dataframe containing the subject data.
    """
    print(f'Opening and processing data for {subject}...')

    # Open the data for the given subject encoding all special characters like accents
    dataframe = pd.read_csv(f'../../data/raw/ranking_by_subject/{subject}.csv', delimiter=';', encoding='utf-8')

    # If there is international column, remove it
    if 'International' in dataframe.columns:
        dataframe.drop(columns='International', inplace=True)

    # Only text-typed year columns support the `.str` accessor; an all-NaN or
    # purely numeric column is parsed as a numeric dtype and must be left alone
    text_year_columns = [
        column for column in ['2022', '2023']
        if pd.api.types.is_object_dtype(dataframe[column])
        or pd.api.types.is_string_dtype(dataframe[column])
    ]

    # Format the rows that contain "-" to include only the first ranking
    for column in text_year_columns:
        mask = dataframe[column].str.contains('-', na=False, regex=False)
        dataframe.loc[mask, column] = dataframe.loc[mask, column].str.split('-').str[0]

    # Remove other special characters from the year columns
    if text_year_columns:
        dataframe = remove_string(dataframe, text_year_columns, ['=', ' '])

    # Change decimal place
    dataframe = change_decimal_place(dataframe, ['Academic', 'Employer', 'Citations', 'H', 'Score'])

    # Convert columns to numeric
    dataframe = convert_to_numeric(dataframe, ['Academic', 'Employer', 'Citations', 'H', 'Score', '2022', '2023'])

    # Rename columns to include the subject ('International' was dropped above,
    # so it does not need an entry here)
    columns_to_rename = ['Academic', 'Employer', 'Citations', 'H', 'Score', '2022', '2023']
    new_columns = [f'{column}_{subject}' for column in columns_to_rename]
    dataframe.rename(columns=dict(zip(columns_to_rename, new_columns)), inplace=True)
    print(f'Data for {subject} processed successfully.\n')

    return dataframe

In [92]:
def merge_dataframes(df1, df2):
    """
    Merge two dataframes based on the institution name as the key.

    The merge is an outer join on 'Institution', so institutions present in
    only one input are kept and their missing values are NaN. When both
    inputs carry a 'Location' column, the two are combined into a single
    'Location' column, preferring the value from ``df1`` and falling back to
    the one from ``df2``.

    Args:
        df1 (pandas.DataFrame): The first dataframe to be merged. When it is
            empty, ``df2`` is returned unchanged.
        df2 (pandas.DataFrame): The second dataframe to be merged.

    Returns:
        pandas.DataFrame: The merged dataframe.

    Raises:
        AssertionError: If there are duplicates in the 'Institution' column of the merged dataframe.
    """
    # If df1 is empty, df2 is the first dataframe and there is nothing to merge
    if df1.empty:
        return df2

    # Merge the dataframes using the institution name as the key using NaN as fill value
    merged_df = df1.merge(df2, on='Institution', how='outer', suffixes=('', '_y'))

    # Merge location columns; 'Location_y' only exists when both inputs had 'Location'
    if 'Location_y' in merged_df.columns:
        merged_df['Location'] = merged_df['Location'].fillna(merged_df['Location_y'])
        merged_df = merged_df.drop(columns='Location_y')

    # Raised explicitly (rather than with an `assert` statement) so the check
    # still runs when Python is started with -O
    duplicated_names = merged_df.loc[merged_df['Institution'].duplicated(), 'Institution']
    if not duplicated_names.empty:
        raise AssertionError(
            f"Duplicate institutions after merge: {duplicated_names.unique().tolist()}"
        )

    return merged_df

In [93]:
# Function to open ranking data for multiple subjects
def open_ranking_data(subjects):
    """
    Opens and merges ranking data for the given subjects.

    Each subject is loaded with ``open_subject_data`` and folded into a
    running result with ``merge_dataframes``. The per-subject dataframe is
    held only as a temporary value, so nothing is written to (or deleted
    from) the module's global namespace.

    Parameters:
    subjects (list): A list of subjects for which ranking data needs to be opened.

    Returns:
    pandas.DataFrame: The merged dataframe containing ranking data for all the subjects.
        An empty dataframe when ``subjects`` is empty.
    """
    df = pd.DataFrame()

    for subject in subjects:
        df = merge_dataframes(df, open_subject_data(subject))

    return df

In [94]:
# Open subjects.txt file (one subject name per line); read it as UTF-8 to
# match the encoding used for the per-subject CSV files
with open('../../data/raw/subjects.txt', 'r', encoding='utf-8') as file:
    subject_lines = file.read().splitlines()

# Lowercase the subject names, trim surrounding whitespace and skip blank
# lines so a stray empty line does not turn into a lookup for '.csv'
subjects = [line.strip().lower() for line in subject_lines if line.strip()]

# Open and process ranking data
ranking_data = open_ranking_data(subjects)
ranking_data.head()

Opening and processing data for accounting_&_finance...
Data for accounting_&_finance processed successfully.

Opening and processing data for business_&_management...
Data for business_&_management processed successfully.

Opening and processing data for communication_&_media...
Data for communication_&_media processed successfully.

Opening and processing data for computer_science...
Data for computer_science processed successfully.

Opening and processing data for development_studies...
Data for development_studies processed successfully.

Opening and processing data for economics_&_econometrics...
Data for economics_&_econometrics processed successfully.

Opening and processing data for mathematics...
Data for mathematics processed successfully.

Opening and processing data for social_sciences...
Data for social_sciences processed successfully.

Opening and processing data for statistics_&_operational_research...
Data for statistics_&_operational_research processed successfully.



Unnamed: 0,2023_accounting_&_finance,2022_accounting_&_finance,Institution,Location,Academic_accounting_&_finance,Employer_accounting_&_finance,Citations_accounting_&_finance,H_accounting_&_finance,Score_accounting_&_finance,2023_business_&_management,...,Citations_social_sciences,H_social_sciences,Score_social_sciences,2023_statistics_&_operational_research,2022_statistics_&_operational_research,Academic_statistics_&_operational_research,Employer_statistics_&_operational_research,Citations_statistics_&_operational_research,H_statistics_&_operational_research,Score_statistics_&_operational_research
0,,,AGH University of Science and Technology,Poland,,,,,,,...,,,,,,,,,,
1,,,Aalborg University,Denmark,,,,,,351.0,...,80.9,68.4,,,,,,,,
2,151.0,101.0,Aalto University,Finland,67.7,54.5,70.5,73.4,,76.0,...,90.4,79.0,67.1,,,,,,,
3,201.0,151.0,Aarhus University,Denmark,62.4,57.2,73.4,69.9,,201.0,...,83.4,75.8,68.8,101.0,101.0,67.7,62.2,81.6,79.4,
4,,,Abu Dhabi University,United Arab Emirates,,,,,,251.0,...,66.5,60.7,,,,,,,,


In [95]:
# Normalise the institution names in a single pass over the column
ranking_data['Institution'] = (
    ranking_data['Institution']
    # Drop trailing whitespace
    .str.rstrip()
    # Lowercase every name
    .str.lower()
    # Collapse any word that starts with 'univers' followed by more letters into 'university'
    .str.replace(r'\bunivers\w+', 'university', regex=True)
    # Decompose accented characters (ex, á -> a + combining accent) ...
    .str.normalize('NFKD')
    # ... then discard everything that is not ASCII, which removes the accents
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
)

In [96]:
# Write the processed ranking table to disk without the row index
ranking_data.to_csv(path_or_buf='../../data/processed/processed_ranking_data.csv', index=False)