# Programming for Data Science


# Programming Task 1: You MUST NOT import any Python library functions for this task.

### Requirement FR1 - Develop a function to read a single column from a CSV file
### Description: The function should accept two parameters: the data file name and a column number. The column number specifies which of the columns to read. It can range between 0 and n-1 (where n is the number of columns). The function should return two values: the column name and a List containing all the specified column’s data values. You should use the task1.csv data file to test your function but your function should also work for other CSV files. 

In [1]:
# Define a function to read a specific column from a CSV file and return its name and values
def read_column (csv_file, column_number):
    """Read one column from a comma-separated file.

    The first line of the file is treated as the header row. Fields are
    separated with a plain ``split(",")`` (the task forbids library imports),
    so quoted fields that contain commas are not supported.

    Parameters:
        csv_file: path of the CSV file to read.
        column_number: zero-based index of the column (0 to n-1, where n is
            the number of fields in the header row).

    Returns:
        A tuple ``(column_name, column_values)``. The name is the header
        field stripped of surrounding whitespace and lower-cased; the values
        are the stripped strings found in that column, in file order.

    Raises:
        ValueError: if column_number is outside the header's range.
        IndexError: if a non-blank data row has fewer fields than
            column_number + 1.
    """
    # "utf-8-sig" drops a leading byte-order mark so it cannot end up in the first column name
    with open (csv_file, "r", encoding="utf-8-sig") as file:

        # The first line holds the column names
        header = file.readline().strip().split(",")

        # Reject an invalid column index before reading any data rows
        if not 0 <= column_number < len(header):
            raise ValueError ("column number is out of range")

        column_name = header[column_number].strip().lower()
        column_values = []

        # Iterate over the remaining lines in the file
        for line in file:
            stripped_line = line.strip()

            # Skip blank lines (e.g. a trailing newline at the end of the file):
            # they are not data rows and would otherwise produce a spurious ''
            # value for column 0 or an IndexError for any other column
            if not stripped_line:
                continue

            # Separate the fields on commas and keep the requested one
            column_values.append(stripped_line.split(",")[column_number].strip())

    # Return column name and column values
    return (column_name, column_values)

# Test the function
        
# Read the first column (index 0) of task1.csv, then show its name and its values
x , y = read_column ("task1.csv", 0)
print (f"Column Name: {x} \nColumn Values: {y}")

Column Name: cancer 
Column Values: ['70', '70', '68', '53', '75', '69', '70', '63', '73', '66', '70', '63', '68', '69', '72', '65', '69', '61', '68', '67', '68', '65', '65', '68', '64', '63', '68', '69', '61', '72', '63', '75', '67', '72', '63', '69', '73', '70', '69', '73', '69', '73', '65', '70', '64', '64', '63', '68', '70', '70', '70', '62', '81', '69', '72', '69', '66', '67', '70', '84', '65', '65', '63', '81', '65', '67', '66', '67', '61', '76', '66', '70', '67', '70', '73', '63', '62', '82', '75', '65', '74', '68', '81', '76', '57', '65', '62', '64', '65', '63', '69', '65', '100', '65', '62', '66', '64', '61', '64', '60', '55', '64', '57', '63', '59', '66', '68', '70', '70', '51', '58', '57', '55', '68', '67', '72', '67', '58', '61', '60', '58', '67', '68', '66', '62', '59', '66', '64', '60', '63', '65', '55', '56', '63', '59', '60', '65', '73', '65', '65', '61', '64', '65', '63', '70', '59', '67', '68', '66', '64', '64', '65', '68', '57', '68', '65', '64', '66', '72', '68', '6

### Requirement FR2 - Develop a function to read CSV data from a file into memory
### Description: The task1.csv data file contains several columns of data values. This function should accept a single parameter: the data file name. It should make use of the function developed in FR1 to read all columns of data from the data file and add them to a Dictionary data structure. The Dictionary should contain one entry for each column in the CSV data file. 

In [2]:
# Define a function to read data from a CSV file and return it as a dictionary
def read_file (csv_file):
    """Read every column of a CSV file into a dictionary.

    Each column is read with read_column (the FR1 function), so the
    dictionary keys are the names exactly as read_column returns them and
    no differently spelled duplicate of a header name is left behind.

    Parameters:
        csv_file: path of the CSV file to read.

    Returns:
        A dictionary with one entry per column, mapping the column name to
        the list of that column's values converted to float, in file order.

    Raises:
        ValueError: if a value cannot be converted to float.
    """
    # Read only the header line here, to learn how many columns the file has;
    # the handle is closed again before read_column re-opens the file
    with open (csv_file, "r", encoding="utf-8-sig") as file:
        column_count = len(file.readline().strip().split(","))

    # Filled in header order, so the dictionary keeps the file's column order
    data_dict = {}

    for n in range (column_count):

        # Use function from FR1 to read name and values of each column
        column_name, column_values = read_column(csv_file, n)

        # Store the values as numbers under the name reported by read_column
        data_dict[column_name] = [float(value) for value in column_values]

    # Return CSV data as a dictionary
    return data_dict
    
# Test the function
    
# Load all columns of task1.csv; data_structure is reused by the FR4 test further down
data_structure = read_file ("task1.csv")
print (data_structure)

{'cancer': [70.0, 70.0, 68.0, 53.0, 75.0, 69.0, 70.0, 63.0, 73.0, 66.0, 70.0, 63.0, 68.0, 69.0, 72.0, 65.0, 69.0, 61.0, 68.0, 67.0, 68.0, 65.0, 65.0, 68.0, 64.0, 63.0, 68.0, 69.0, 61.0, 72.0, 63.0, 75.0, 67.0, 72.0, 63.0, 69.0, 73.0, 70.0, 69.0, 73.0, 69.0, 73.0, 65.0, 70.0, 64.0, 64.0, 63.0, 68.0, 70.0, 70.0, 70.0, 62.0, 81.0, 69.0, 72.0, 69.0, 66.0, 67.0, 70.0, 84.0, 65.0, 65.0, 63.0, 81.0, 65.0, 67.0, 66.0, 67.0, 61.0, 76.0, 66.0, 70.0, 67.0, 70.0, 73.0, 63.0, 62.0, 82.0, 75.0, 65.0, 74.0, 68.0, 81.0, 76.0, 57.0, 65.0, 62.0, 64.0, 65.0, 63.0, 69.0, 65.0, 100.0, 65.0, 62.0, 66.0, 64.0, 61.0, 64.0, 60.0, 55.0, 64.0, 57.0, 63.0, 59.0, 66.0, 68.0, 70.0, 70.0, 51.0, 58.0, 57.0, 55.0, 68.0, 67.0, 72.0, 67.0, 58.0, 61.0, 60.0, 58.0, 67.0, 68.0, 66.0, 62.0, 59.0, 66.0, 64.0, 60.0, 63.0, 65.0, 55.0, 56.0, 63.0, 59.0, 60.0, 65.0, 73.0, 65.0, 65.0, 61.0, 64.0, 65.0, 63.0, 70.0, 59.0, 67.0, 68.0, 66.0, 64.0, 64.0, 65.0, 68.0, 57.0, 68.0, 65.0, 64.0, 66.0, 72.0, 68.0, 67.0, 64.0, 67.0, 57.0, 59.

### Requirement FR3 - Develop a function to calculate the Kendall Tau Correlation Coefficient for two lists of data
### Description: This function should calculate the Kendall Tau Rank Correlation Coefficient for two lists of data. The function should take two lists of data (of equal length) as parameters. The function should ensure that the lists are of equal length otherwise raise an error. The function should return the calculated coefficient value. 

In [3]:
# Define a function to calculate the Kendall Tau Correlation Coefficient for two lists
def KTCC (L1, L2):
    """Calculate the Kendall Tau rank correlation coefficient of two lists.

    The tie-corrected form is used:
    (concordant - discordant) / sqrt((n0 - t1) * (n0 - t2)), where n0 is the
    number of pairs of observations and t1 / t2 are the numbers of tied
    pairs within L1 / L2.

    Parameters:
        L1: first list of observations (hashable, mutually comparable values).
        L2: second list of observations, the same length as L1.

    Returns:
        The coefficient rounded to 4 decimal places, or 0 when the
        denominator is zero (fewer than two observations, or every value in
        one of the lists is identical), because the coefficient is
        undefined in that case.

    Raises:
        ValueError: if the two lists have different lengths.
    """
    # Ensure both lists have equal lengths and raise an error if not
    if len(L1) != len(L2):
        raise ValueError ("Lists must have equal lengths")

    # Store the length of the lists
    n = len(L1)

    # Give initial value of zero to both numbers of concordant and discordant pairs
    concordant_pairs, discordant_pairs = 0, 0

    # Using nested loops to visit every pair (i, j) with i < j exactly once
    for i in range(n - 1):
        for j in range(i + 1, n):

            # Direction of change in each list: -1, 0 or 1. Comparing the signs
            # instead of multiplying the raw differences avoids float underflow,
            # which would make two tiny non-zero differences look like a tie
            direction_1 = (L1[i] > L1[j]) - (L1[i] < L1[j])
            direction_2 = (L2[i] > L2[j]) - (L2[i] < L2[j])
            agreement = direction_1 * direction_2

            if agreement > 0:
                concordant_pairs += 1
            elif agreement < 0:
                discordant_pairs += 1

    # Count the number of tied pairs in each list
    tie_count_L1 = _count_tied_pairs(L1)
    tie_count_L2 = _count_tied_pairs(L2)

    # Calculate the total number of pair combinations
    total_pairs = n * (n - 1) // 2

    # Adjust the denominator of the formula to account for ties
    denominator = ((total_pairs - tie_count_L1) * (total_pairs - tie_count_L2)) ** 0.5

    # The coefficient is undefined when the denominator is zero; report 0
    if denominator == 0:
        return 0

    Kendall_Tau = (concordant_pairs - discordant_pairs) / denominator
    return round (Kendall_Tau, 4)


def _count_tied_pairs (values):
    """Return how many pairs of positions in values hold equal values."""
    # One pass to count occurrences, instead of calling list.count per distinct value
    occurrences = {}
    for value in values:
        occurrences[value] = occurrences.get(value, 0) + 1

    # A value that occurs c times forms c * (c - 1) / 2 tied pairs
    return sum(count * (count - 1) // 2 for count in occurrences.values())

# Testing the custom KTCC function against SciPy's kendalltau function for comparison

# Sample data: list1 contains a repeated value (100), so the tie correction is exercised
list1 = [100,100,25,410,1]
list2 = [0,45,13,2,1520]
k1 = KTCC (list1,list2)
print (k1)

# SciPy is used only to cross-check the hand-written function.
# The stats submodule is imported explicitly: a bare "import scipy" does not
# load scipy.stats on older SciPy releases, so sp.stats would raise AttributeError
import scipy as sp
import scipy.stats
k2 = sp.stats.kendalltau (list1,list2)
print (k2)

-0.527
SignificanceResult(statistic=-0.5270462766947299, pvalue=0.206507295485425)


Kendall's Tau is a rank correlation coefficient and is thus a measure of the relationship between two variables. It is a non-parametric statistic (it does not assume a specific distribution for the data) that assesses the strength and direction of the relationship between two variables, making it suitable for data that does not necessarily follow a normal distribution. The Kendall Tau correlation coefficient is calculated based on the number of concordant and discordant pairs of observations in the data. A pair of observations is considered concordant if both variables order the two observations the same way (the observation with the larger value in the first list also has the larger value in the second list). Conversely, the pair is discordant if the two variables order the observations in opposite ways.
In my code, the denominator in the formula adjusts for ties in the data. Ties can affect the total number of pairs that can be formed, and therefore, they need to be accounted for in the calculation. If the denominator is zero (which can occur if all the elements in one or both lists are identical), the coefficient is set to 0, as correlation in such cases is not defined. The final Kendall Tau correlation coefficient is a value between -1 and 1, where 1 implies a perfect positive correlation, -1 implies a perfect negative correlation, and 0 implies no correlation.

### Requirement FR4 - Develop a function to generate a set of Kendall Tau Correlation Coefficients for a data structure like the one generated in FR2
### Description: The function should accept one parameter: the Dictionary data structure generated in FR2. This function should make use of the function developed in FR3 to generate a Kendall Tau Rank Correlation Coefficient for every pair of columns in the input data structure parameter. The function should return a list of tuples, each tuple containing the two column names and associated correlation coefficient value. 

In [4]:
# Define a function to calculate Kendall Tau coefficients for all pairs of columns in a given data structure
def KTCC_SET (data):
    """Calculate the Kendall Tau coefficient for every pair of columns.

    Parameters:
        data: dictionary mapping each column name to its list of values
            (the structure produced for FR2).

    Returns:
        A list of tuples (first_name, second_name, coefficient), one for
        each unordered pair of distinct columns, in dictionary key order.
    """
    # Column names, in the order the dictionary holds them
    names = list (data.keys())

    # Collected (name, name, coefficient) tuples
    pairs = []

    # Pair each column only with the columns that come after it,
    # so that every unordered pair is produced exactly once
    for position, first_name in enumerate (names):
        for second_name in names[position + 1:]:

            # Use function from FR3 to calculate the coefficient of this pair
            coefficient = KTCC (data[first_name], data[second_name])
            pairs.append ((first_name, second_name, coefficient))

    return pairs

# Test the function with the data structure resulted from FR2

# Compute the coefficient for every column pair of the FR2 dictionary; result is reused by the FR5 test
result = KTCC_SET (data_structure)
print (result)

[('cancer', 'cardiovascular', 0.1445), ('cancer', 'stroke', 0.2625), ('cancer', 'depression', 0.2749), ('cancer', 'rehab', 0.2662), ('cancer', 'vaccine', 0.1938), ('cancer', 'diarrhea', 0.2644), ('cancer', 'obesity', 0.1349), ('cancer', 'diabetes', 0.1795), ('cardiovascular', 'stroke', 0.053), ('cardiovascular', 'depression', 0.0057), ('cardiovascular', 'rehab', 0.1599), ('cardiovascular', 'vaccine', -0.0852), ('cardiovascular', 'diarrhea', -0.0333), ('cardiovascular', 'obesity', 0.1784), ('cardiovascular', 'diabetes', 0.1623), ('stroke', 'depression', 0.2551), ('stroke', 'rehab', 0.1291), ('stroke', 'vaccine', 0.1136), ('stroke', 'diarrhea', 0.2757), ('stroke', 'obesity', 0.0554), ('stroke', 'diabetes', 0.1707), ('depression', 'rehab', -0.016), ('depression', 'vaccine', 0.3091), ('depression', 'diarrhea', 0.3227), ('depression', 'obesity', 0.257), ('depression', 'diabetes', 0.1045), ('rehab', 'vaccine', -0.0658), ('rehab', 'diarrhea', 0.1413), ('rehab', 'obesity', -0.0382), ('rehab', 

### Requirement FR5 - Develop a function to print a custom table for selected data from a data structure like the one generated in FR4
### Description: This function should output the Kendall Tau Rank Correlation Coefficient for a subset of the column pairs generated in FR4. The function should take three parameters: list of correlation coefficient tuples, border character to use and which columns to include. 

In [5]:
# Define a function to get data structure, border character, and selected variables and print a table of coefficients
def KTCC_TABLE (data, border_char, selected_columns):
    """Print a bordered table of coefficients for the selected columns.

    The table has one row and one column per selected name. The diagonal
    shows "-" and every other cell shows the coefficient of that pair of
    columns, whichever order the pair was stored in.

    Parameters:
        data: list of (column_name, column_name, coefficient) tuples, as
            produced for FR4.
        border_char: character used to draw the borders. The width
            calculations assume a single character.
        selected_columns: list of column names to include, in display order.

    Raises:
        ValueError: if selected_columns is empty.
        KeyError: if data holds no coefficient for a pair of selected columns.
    """
    if not selected_columns:
        raise ValueError ("selected_columns must contain at least one column name")

    # Filter results in data structure to include only selected variables
    wanted = set(selected_columns)
    selected_results = {
        (col1, col2): Kendall
        for col1, col2, Kendall in data
        if col1 in wanted and col2 in wanted
    }

    # Resolve the text of every cell before printing anything, so that a missing
    # pair is reported up front rather than after half of the table was printed
    table_cells = []
    for col1 in selected_columns:
        row_cells = []
        for col2 in selected_columns:

            # Put "-" in cells with the same column and row name
            if col1 == col2:
                row_cells.append("-")

            # A pair is stored once, in either order
            elif (col1, col2) in selected_results:
                row_cells.append(str(selected_results[(col1, col2)]))
            elif (col2, col1) in selected_results:
                row_cells.append(str(selected_results[(col2, col1)]))
            else:
                raise KeyError (f"no coefficient found for columns {col1!r} and {col2!r}")
        table_cells.append(row_cells)

    # Cell width: the longest text that has to fit (a column name or a cell value),
    # plus 2 for one space on each side. Including the values keeps the table
    # aligned when the column names are shorter than the coefficients
    longest_name = max(len(col) for col in selected_columns)
    longest_value = max(len(text) for row_cells in table_cells for text in row_cells)
    cell_width = max(longest_name, longest_value) + 2

    column_count = len(selected_columns)

    # Blank area above the first (row name) column: one cell plus its left border
    corner = " " * (cell_width + 1)

    # Full-width border: all cells including the row name column, plus the border characters between them
    full_border = border_char * ((column_count + 1) * cell_width + (column_count + 2))

    # Top header border line (covers only the selected columns, not the corner)
    lines = [corner + border_char * (column_count * cell_width + (column_count + 1))]

    # Header line with the selected variable names
    lines.append(corner + border_char + "".join(
        f"{col.center(cell_width)}{border_char}" for col in selected_columns))

    # Border between the header and the rows
    lines.append(full_border)

    # One line per selected column: its name followed by its cells
    for col1, row_cells in zip(selected_columns, table_cells):
        lines.append(f"{border_char}{col1.center(cell_width)}{border_char}" + "".join(
            f"{text.center(cell_width)}{border_char}" for text in row_cells))

    # Bottom table border line
    lines.append(full_border)

    print ("\n".join(lines))

# Test the function with the result from FR4
    
# Columns to show in the table (display order) and the character used to draw its borders
variables = ["cancer", "depression", "cardiovascular", "rehab", "vaccine"]
boarder = "*"
KTCC_TABLE (result, boarder, variables)

                 **************************************************************************************
                 *     cancer     *   depression   * cardiovascular *     rehab      *    vaccine     *
*******************************************************************************************************
*     cancer     *       -        *     0.2749     *     0.1445     *     0.2662     *     0.1938     *
*   depression   *     0.2749     *       -        *     0.0057     *     -0.016     *     0.3091     *
* cardiovascular *     0.1445     *     0.0057     *       -        *     0.1599     *    -0.0852     *
*     rehab      *     0.2662     *     -0.016     *     0.1599     *       -        *    -0.0658     *
*    vaccine     *     0.1938     *     0.3091     *    -0.0852     *    -0.0658     *       -        *
*******************************************************************************************************
