In [34]:
import pandas as pd

In [35]:
def remove_string(data, columns, strings_to_remove):
    """
    Removes specified literal strings from the given columns of a pandas DataFrame.

    Every entry of ``strings_to_remove`` is matched as plain text, not as a
    regular expression, so characters such as '+', '.', '(' or '*' can be
    removed safely. The DataFrame is modified in place and also returned.

    Args:
        data (pandas.DataFrame): The DataFrame to process.
        columns (list): A list of column names to remove strings from.
        strings_to_remove (list): A list of literal strings to remove from the columns.

    Returns:
        pandas.DataFrame: The same DataFrame object with the specified strings removed.
    """
    for string in strings_to_remove:
        for col in columns:
            # Select only the rows that actually contain the string. na=False
            # keeps missing values out of the mask, so they are left untouched.
            # regex=False makes the match literal instead of a regex pattern.
            mask = data[col].str.contains(string, na=False, regex=False)

            # Strip the literal string from the selected rows only.
            data.loc[mask, col] = data.loc[mask, col].str.replace(string, '', regex=False)

    return data

In [36]:
# Function to convert columns to numeric
def convert_to_numeric(dataframe, columns):
    """
    Cast the selected columns of a dataframe to a numeric data type.

    The dataframe is updated in place and the same object is returned.

    Parameters:
    dataframe (pandas.DataFrame): The dataframe whose columns are converted.
    columns (list): Names of the columns to pass through ``pd.to_numeric``.

    Returns:
    pandas.DataFrame: The input dataframe with the selected columns made numeric.
    """
    # Convert everything first, then assign in one step, so a failure on any
    # column leaves the dataframe unchanged.
    numeric_values = dataframe[columns].apply(pd.to_numeric)
    dataframe[columns] = numeric_values
    return dataframe

In [37]:
# Function to change the decimal separator
def change_decimal_place(dataframe, columns):
    """
    Swap every comma for a decimal point in the selected columns of a dataframe.

    The dataframe is updated in place and the same object is returned.

    Args:
        dataframe (pandas.DataFrame): The dataframe to modify.
        columns (str or list): The column(s) whose commas are replaced.

    Returns:
        pandas.DataFrame: The input dataframe with commas replaced by decimal
        points in the selected columns.
    """
    # regex=True makes the replacement act on substrings rather than on whole
    # cell values, so a comma anywhere inside a cell is replaced.
    with_points = dataframe[columns].replace(to_replace=',', value='.', regex=True)
    dataframe[columns] = with_points
    return dataframe

In [38]:
# Function to open and preprocess ranking data for a given subject
def open_subject_data(subject):
    """
    Opens and processes the data for a specific subject.

    Reads '../../data/raw/{subject}.csv' (';'-delimited), then:
      1. in the '2022' and '2023' columns, keeps only the part before the
         first '-' for values that contain one, and removes '=' and spaces;
      2. replaces commas with decimal points in the score columns;
      3. converts the score and rank columns to numeric;
      4. renames those columns to '<column>_<subject>'.

    Parameters:
    subject (str): The name of the subject for which the data needs to be opened.

    Returns:
    pandas.DataFrame: The processed dataframe containing the subject data.
    """
    rank_columns = ['2022', '2023']
    score_columns = ['Academic', 'Employer', 'Citations', 'H', 'International', 'Score']

    # Read the rank columns as text. If a file happens to contain only plain
    # numbers in a rank column, pandas would infer a numeric dtype and the
    # `.str` accessor used below would raise AttributeError.
    dataframe = pd.read_csv(
        f'../../data/raw/{subject}.csv',
        delimiter=';',
        encoding_errors='ignore',
        dtype={column: str for column in rank_columns},
    )

    for column in rank_columns:
        # Rows whose rank contains "-" keep only the first ranking
        mask = dataframe[column].str.contains('-', na=False, regex=False)
        dataframe.loc[mask, column] = dataframe.loc[mask, column].str.split('-').str[0]

    # Remove other special characters from the rank columns
    dataframe = remove_string(dataframe, rank_columns, ['=', ' '])

    # Change decimal place
    dataframe = change_decimal_place(dataframe, score_columns)

    # Convert columns to numeric
    columns_to_rename = score_columns + rank_columns
    dataframe = convert_to_numeric(dataframe, columns_to_rename)

    # Rename columns to include the subject
    renamed = {column: f'{column}_{subject}' for column in columns_to_rename}
    dataframe.rename(columns=renamed, inplace=True)

    return dataframe

In [39]:
def merge_dataframes(df1, df2):
    """
    Merge two dataframes based on the institution name as the key.

    An outer merge on 'Institution' is performed, so institutions present in
    only one dataframe are kept and their missing values are NaN. The
    'Location' column of ``df1`` is kept, with gaps filled from ``df2``.
    If ``df1`` is empty, ``df2`` is returned as is, without any check.

    Args:
        df1 (pandas.DataFrame): The first dataframe to be merged.
        df2 (pandas.DataFrame): The second dataframe to be merged.

    Returns:
        pandas.DataFrame: The merged dataframe.

    Raises:
        AssertionError: If there are duplicates in the 'Institution' column of the merged dataframe.
    """
    # Nothing accumulated yet: the second dataframe is the result
    if df1.empty:
        return df2

    # Merge the dataframes using the institution name as the key using NaN as fill value
    merged_df = df1.merge(df2, on='Institution', how='outer', suffixes=('', '_y'))

    # Merge location columns: prefer df1's value, fall back to df2's
    merged_df['Location'] = merged_df['Location'].fillna(merged_df['Location_y'])
    merged_df = merged_df.drop(columns='Location_y')

    # Raise explicitly instead of using `assert`, which is skipped when Python
    # runs with -O. The exception type stays AssertionError for callers.
    duplicated = merged_df.loc[merged_df['Institution'].duplicated(), 'Institution']
    if not duplicated.empty:
        raise AssertionError(
            f"Duplicate institutions after merge: {duplicated.unique().tolist()}"
        )

    return merged_df

In [40]:
# Function to open ranking data for multiple subjects
def open_ranking_data(subjects):
    """
    Load the ranking data of every given subject and merge it into one dataframe.

    As a side effect, each subject's processed dataframe is also stored as a
    module-level variable named 'df_<subject>'.

    Parameters:
    subjects (list): The subjects whose ranking data is loaded.

    Returns:
    pandas.DataFrame: The merged ranking data of all the subjects.
    """
    merged = pd.DataFrame()

    for name in subjects:
        subject_frame = open_subject_data(name)

        # NOTE(review): the global 'df_<subject>' variable is kept on purpose;
        # other cells may read it - confirm before removing.
        globals()[f'df_{name}'] = subject_frame

        merged = merge_dataframes(merged, subject_frame)

    return merged

In [41]:
# Subjects to load; each name is used to build the path of the raw CSV file
subjects = ['mathematics', 'computer_science']

# Load every subject, merge the results on 'Institution', and preview the
# first rows of the combined table
ranking_data = open_ranking_data(subjects)
ranking_data.head()

Unnamed: 0,2023_mathematics,2022_mathematics,Institution,Location,Academic_mathematics,Employer_mathematics,Citations_mathematics,H_mathematics,International_mathematics,Score_mathematics,2023_computer_science,2022_computer_science,Academic_computer_science,Employer_computer_science,Citations_computer_science,H_computer_science,International_computer_science,Score_computer_science
0,401.0,501.0,AGH University of Science and Technology,Poland,48.9,50.9,71.7,67.1,59.4,,501.0,551.0,45.4,49.8,65.8,62.5,64.0,
1,251.0,351.0,Aalborg University,Denmark,52.2,53.9,78.1,70.3,81.3,,151.0,151.0,54.0,55.8,81.1,82.8,96.4,
2,151.0,201.0,Aalto University,Finland,55.7,60.2,84.6,81.6,68.0,,84.0,84.0,67.2,57.6,83.6,81.3,92.1,69.4
3,251.0,251.0,Aarhus University,Denmark,57.1,53.5,82.3,72.7,56.1,,140.0,107.0,65.3,58.5,78.0,70.6,72.5,65.9
4,,,Ain Shams University,Egypt,,,,,,,351.0,351.0,45.0,66.5,67.5,59.0,42.2,


In [42]:
# Save the merged ranking data; index=False leaves the row index out of the file
ranking_data.to_csv('../../data/processed/ranking_data.csv', index=False)