# UFCFVQ-15-M Programming for Data Science (Autumn 2023)


## Student Id: 23020764

# Programming Task 1

### Requirement FR1 - Develop a function to read a single column from a CSV file

In [1]:
def read_single_column_csv(file_name, column_number):
    """
    Reads a single column from a CSV file

    The first row of the file is treated as the header. The file is parsed with
    the standard-library csv module, so quoted fields that contain commas stay
    in one piece. Blank lines are skipped, and a row that is too short to hold
    the requested column contributes an empty string.

    Parameters:
    - file_name (str): the name of the CSV file
    - column_number (int): the zero-based index of the chosen column

    Returns:
    - str: the column name from the header
    - list: the values of the chosen column as strings, one per data row

    Raises:
    - ValueError: if column_number is negative, or is not a valid index into
      the header row (this includes a file with no header at all)

    """
    # Imported locally so this cell works on its own in the notebook.
    import csv

    # Ensure to give appropriate errors to foreseen problems that may arise from user mistake.
    if column_number < 0:
        raise ValueError("Column number must be equal or greater than 0")

    column_data = []

    # 'utf-8-sig' already drops a leading BOM; newline='' is what the csv module
    # expects so that it can handle line endings itself.
    with open(file_name, 'r', encoding='utf-8-sig', newline='') as file:
        reader = csv.reader(file)

        # The header is the first row; an empty file gives an empty header.
        header = next(reader, [])

        # Check if the specified column number is valid for the dataset or not
        if column_number >= len(header):
            raise ValueError("Column number is not in the range of columns for this dataset")

        column_name = header[column_number].strip()

        for row in reader:
            if not row:
                # csv.reader yields an empty list for a blank line
                continue
            if column_number < len(row):
                column_data.append(row[column_number].strip())
            else:
                # Short row: keep the list aligned with the other columns
                column_data.append('')

    return column_name, column_data


# Quick check of FR1, assuming this Jupyter notebook and the CSV file are in the same folder.
# Column 0 is the first column of the file.

file_name, column_number = "task1.csv", 0

column_name, column_data = read_single_column_csv(file_name, column_number)
print(column_name)
print(column_data)


cancer
['70', '70', '68', '53', '75', '69', '70', '63', '73', '66', '70', '63', '68', '69', '72', '65', '69', '61', '68', '67', '68', '65', '65', '68', '64', '63', '68', '69', '61', '72', '63', '75', '67', '72', '63', '69', '73', '70', '69', '73', '69', '73', '65', '70', '64', '64', '63', '68', '70', '70', '70', '62', '81', '69', '72', '69', '66', '67', '70', '84', '65', '65', '63', '81', '65', '67', '66', '67', '61', '76', '66', '70', '67', '70', '73', '63', '62', '82', '75', '65', '74', '68', '81', '76', '57', '65', '62', '64', '65', '63', '69', '65', '100', '65', '62', '66', '64', '61', '64', '60', '55', '64', '57', '63', '59', '66', '68', '70', '70', '51', '58', '57', '55', '68', '67', '72', '67', '58', '61', '60', '58', '67', '68', '66', '62', '59', '66', '64', '60', '63', '65', '55', '56', '63', '59', '60', '65', '73', '65', '65', '61', '64', '65', '63', '70', '59', '67', '68', '66', '64', '64', '65', '68', '57', '68', '65', '64', '66', '72', '68', '67', '64', '67', '57', '59', '

### Requirement FR2 - Develop a function to read CSV data from a file into memory

In [3]:
def read_csv_to_memory_dict(file_name):
    """
    Reads CSV file data into the memory as a dictionary

    The file is read once, using the standard-library csv module. The first row
    is the header. Blank lines are skipped and a row with fewer cells than the
    header is padded with empty strings, so every column list has the same
    length. If two columns share a name, the later one replaces the earlier one.

    Parameters:
    - file_name (str): the name of the CSV file

    Returns:
    - dict: a dictionary where each key is a column name and the value is the
      list of the column data (as strings). An empty file gives an empty
      dictionary.
    - None: if the file cannot be opened or parsed; a message is printed instead
      of raising.

    Raises:
    - ValueError: if file_name is empty

    """
    # Imported locally so this cell works on its own in the notebook.
    import csv

    # Ensure to give appropriate errors to foreseen problems that may arise from user mistake.
    if not file_name:
        raise ValueError("Please provide the file name")

    try:
        # 'utf-8-sig' drops a leading BOM; newline='' is what the csv module expects.
        with open(file_name, 'r', encoding='utf-8-sig', newline='') as file:
            reader = csv.reader(file)

            # Read the header, the first row, to extract the column names
            column_names = [name.strip() for name in next(reader, [])]

            # One list per column, filled in a single pass over the rows
            columns = [[] for _ in column_names]
            for row in reader:
                if not row:
                    # csv.reader yields an empty list for a blank line
                    continue
                for index, column in enumerate(columns):
                    column.append(row[index].strip() if index < len(row) else '')

    except FileNotFoundError:
        print(f"File {file_name} not found, consider checking the path")
        return None
    except (OSError, UnicodeDecodeError, csv.Error) as e:
        # Only I/O, decoding and CSV-format problems are reported this way;
        # anything else is a programming error and is allowed to surface.
        print(f"Error: {e}")
        return None

    return dict(zip(column_names, columns))
    
# Quick check of FR2: load the whole file, then show the dictionary,
# the number of columns it holds, and the data of one column.

file_name = "task1.csv"

memory = read_csv_to_memory_dict(file_name)
print(memory)
print(len(memory))
print(memory["cancer"])        

{'cancer': ['70', '70', '68', '53', '75', '69', '70', '63', '73', '66', '70', '63', '68', '69', '72', '65', '69', '61', '68', '67', '68', '65', '65', '68', '64', '63', '68', '69', '61', '72', '63', '75', '67', '72', '63', '69', '73', '70', '69', '73', '69', '73', '65', '70', '64', '64', '63', '68', '70', '70', '70', '62', '81', '69', '72', '69', '66', '67', '70', '84', '65', '65', '63', '81', '65', '67', '66', '67', '61', '76', '66', '70', '67', '70', '73', '63', '62', '82', '75', '65', '74', '68', '81', '76', '57', '65', '62', '64', '65', '63', '69', '65', '100', '65', '62', '66', '64', '61', '64', '60', '55', '64', '57', '63', '59', '66', '68', '70', '70', '51', '58', '57', '55', '68', '67', '72', '67', '58', '61', '60', '58', '67', '68', '66', '62', '59', '66', '64', '60', '63', '65', '55', '56', '63', '59', '60', '65', '73', '65', '65', '61', '64', '65', '63', '70', '59', '67', '68', '66', '64', '64', '65', '68', '57', '68', '65', '64', '66', '72', '68', '67', '64', '67', '57', '59

### Requirement FR3 - Develop a function to calculate the Kendall Tau Correlation Coefficient for two lists of data

In [7]:
def kendalls_tau_b(x, y):
    """
    Calculates the Kendall Tau-b Rank Correlation Coefficient for two lists

    Every pair of positions (i, j) with i < j is classified as concordant,
    discordant, tied only in x, or tied only in y. Pairs tied in both lists
    are ignored. The result is

        (P - Q) / sqrt((P + Q + T) * (P + Q + U))

    where P and Q are the concordant and discordant counts and T and U are the
    counts of pairs tied only in x and only in y.

    Elements are compared with ==, < and >, so they are ordered the way Python
    orders their type. Numeric values stored as strings are therefore compared
    as text; convert them to numbers first if numeric order is wanted.

    Parameters:
    - x (list): the first list of values
    - y (list): the second list of values, same length as x

    Returns:
    - float: the tau-b coefficient
    - None: if the denominator is zero (for example, fewer than two items, or
      one list has all values equal)

    Raises:
    - ValueError: if the two lists have different lengths

    """
    n = len(x)
    if n != len(y):
        raise ValueError("The lists should be of the same length")

    concordant = 0
    discordant = 0
    ties_x = 0
    ties_y = 0

    # Visit each unordered pair of positions exactly once
    for i in range(n - 1):
        x_first, y_first = x[i], y[i]
        for j in range(i + 1, n):
            x_second, y_second = x[j], y[j]
            x_tied = x_first == x_second
            y_tied = y_first == y_second

            # A pair tied in both lists does not count anywhere
            if x_tied and y_tied:
                continue

            x_rises = x_first < x_second
            x_falls = x_first > x_second
            y_rises = y_first < y_second
            y_falls = y_first > y_second

            if (x_rises and y_rises) or (x_falls and y_falls):
                concordant += 1
            elif (x_rises and y_falls) or (x_falls and y_rises):
                discordant += 1

            # Pairs tied in exactly one list
            if x_tied:
                ties_x += 1
            if y_tied:
                ties_y += 1

    untied = concordant + discordant
    denominator = ((untied + ties_x) * (untied + ties_y)) ** 0.5
    if denominator == 0:
        # Nothing to divide by: report "no coefficient" instead of failing
        return None

    return (concordant - discordant) / denominator


# Example usage: compare the hand-written coefficient with SciPy's for one pair of columns.
# NOTE(review): the values in `memory` are strings as read from the CSV, so both
# functions order them as text (for example '100' sorts before '70').
# Convert the columns to numbers first if numeric ordering is intended.
x = memory["cancer"]
y = memory["stroke"]

c = kendalls_tau_b(x, y)
print("Mine function returns:", c)

# SciPy is used only as a reference value for the check
from scipy.stats import kendalltau
corr, _ = kendalltau(x, y)
print('Scipy built-in function returns: %.5f' % corr)



Mine function returns: 0.2586935794614296
Scipy built-in function returns: 0.25869


### Requirement FR4 - Develop a function to generate a set of Kendall Tau Correlation Coefficients for a data structure like the one generated in FR2

In [8]:
def set_of_kendall_tau(data):
    """
    Build the Kendall Tau Rank Correlation Coefficient for each pair of columns.

    Each unordered pair of columns is evaluated once, in the order the columns
    appear in the dictionary; a column is never paired with itself.

    Parameters:
    data (dict): Dictionary where keys are column names, and values are lists of data.

    Returns:
    list: List of (column1, column2, coefficient) tuples, one per column pair.
    """
    names = [*data.keys()]
    results = []

    # Pair every column with each of the columns that come after it
    for position, first in enumerate(names):
        for second in names[position + 1:]:
            tau = kendalls_tau_b(data[first], data[second])
            results.append((first, second, tau))

    return results

# Quick check of FR4: as the last expression of the cell, the returned list is shown as the output

set_of_kendall_tau(memory)

[('cancer', 'cardiovascular', 0.11657256050768439),
 ('cancer', 'stroke', 0.2586935794614296),
 ('cancer', 'depression', 0.23928224488351915),
 ('cancer', 'rehab', 0.2997925331933315),
 ('cancer', 'vaccine', 0.15643680998934015),
 ('cancer', 'diarrhea', 0.23149211765241354),
 ('cancer', 'obesity', 0.13440662133322778),
 ('cancer', 'diabetes', 0.17280145563866803),
 ('cardiovascular', 'stroke', 0.06397863962062682),
 ('cardiovascular', 'depression', 0.030399564597656043),
 ('cardiovascular', 'rehab', 0.1623021558178698),
 ('cardiovascular', 'vaccine', -0.09876137307999196),
 ('cardiovascular', 'diarrhea', -0.02141430756654661),
 ('cardiovascular', 'obesity', 0.17324358360959338),
 ('cardiovascular', 'diabetes', 0.11106647678610486),
 ('stroke', 'depression', 0.2465946318402626),
 ('stroke', 'rehab', 0.14352871455131197),
 ('stroke', 'vaccine', 0.0831514808975091),
 ('stroke', 'diarrhea', 0.2617003002004919),
 ('stroke', 'obesity', 0.05702361260874357),
 ('stroke', 'diabetes', 0.11457579

### Requirement FR5 - Develop a function to print a custom table for selected data from a data structure like the one generated in FR4

In [11]:
def custom_table(correlations, border_char='*', include_columns=None):
    """
    Print a custom table of Kendall Tau Rank Correlation Coefficients.

    Rows and columns are the selected column names in sorted order. The
    diagonal shows "-", and a pair that has no coefficient (missing from
    correlations, or stored as None) shows "N/A". Every line of the table has
    the same width and uses border_char for all of its borders.

    Parameters:
    correlations (list): List of tuples (column1, column2, coefficient) such as the list built for FR4.
    border_char (str): Border character to use for the table.
    include_columns (list): List of column names to include in the table.
        None means every column named in correlations.

    Returns:
    None (table will be printed)
    """
    min_cell_width = 15
    missing_text = "N/A"

    if include_columns is None:
        include_columns = [corr[0] for corr in correlations] + [corr[1] for corr in correlations]

    # Unique column names, sorted alphabetically
    sorted_column_names = sorted(set(include_columns))

    # Index the coefficients by unordered pair so each cell is a direct lookup.
    # setdefault keeps the first entry when a pair appears more than once.
    coefficients = {}
    for corr in correlations:
        coefficients.setdefault(frozenset((corr[0], corr[1])), corr[2])

    # Widen the cells when a name would not fit, so the borders stay aligned
    cell_width = max([min_cell_width] + [len(str(name)) for name in sorted_column_names])

    def build_line(cells):
        # Wrap the cells in borders: one on each side and one between cells
        return border_char + border_char.join(cells) + border_char

    header = build_line(
        [' ' * cell_width] + [f"{name:^{cell_width}}" for name in sorted_column_names]
    )
    # Same width as the header, even if border_char has more than one character
    separator = (border_char * len(header))[:len(header)]

    # Print the table header
    print(separator)
    print(header)
    print(separator)

    # Print the table content
    for row_label in sorted_column_names:
        cells = [f"{row_label:^{cell_width}}"]
        for col_label in sorted_column_names:
            if row_label == col_label:
                text = "-"
            else:
                value = coefficients.get(frozenset((row_label, col_label)))
                # The space flag leaves room for a minus sign so values line up
                text = missing_text if value is None else f"{value: .4f}"
            cells.append(text.center(cell_width))
        print(build_line(cells))
        print(separator)

        

# Quick check of FR5: build the coefficients for all columns, then print a table for four of them
correlation_results = set_of_kendall_tau(memory)
selected_columns  = ["cancer", "vaccine", "diarrhea", "depression"]
custom_table(correlation_results, border_char='*', include_columns = selected_columns) 

********************************************************************************
*               *    cancer     *  depression   *   diarrhea    *    vaccine    *
********************************************************************************
     cancer     *       -       *     0.2393    *     0.2315    *     0.1564    *
********************************************************************************
   depression   *     0.2393    *       -       *     0.3007    *     0.3045    *
********************************************************************************
    diarrhea    *     0.2315    *     0.3007    *       -       *     0.0548    *
********************************************************************************
    vaccine     *     0.1564    *     0.3045    *     0.0548    *       -       *
********************************************************************************
