In [2]:
import pandas as pd

In [3]:
raisin = pd.read_csv('Raisin_Dataset.csv', header=0)

In [18]:
def computeMean(array):
    numRows = len(array)
    numCols = len(array.columns)
    
    colSums = [0] * numCols
    
    for row in array:
        for i in range(numCols):
            colSums[i] += row[i]
    
    means = [sum / numRows for sum in colSums]
    
    return means

array = [[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]]

datasetMean = computeMean(array)
print("Mean values:", datasetMean)

AttributeError: 'list' object has no attribute 'columns'

In [5]:
def computeSampleCovariance(x, y):
    xLength = len(x)
    
    xMean = sum(x) / xLength
    yMean = sum(y) / xLength
    
    covariance = 0
    for i in range(xLength):
        covariance += (x[i] - xMean) * (y[i] - yMean)
    
    covariance /= (xLength - 1)
    
    return covariance

x = [1, 2, 3, 4, 5]
y = [5, 4, 3, 2, 1]

covariance = computeSampleCovariance(x, y)
print("Sample covariance:", covariance)

Sample covariance: -2.5


In [6]:
def computeCorrelation(x, y):
    xLength = len(x)
    
    xMean = sum(x) / xLength
    yMean = sum(y) / xLength
    
    covariance = 0
    for i in range(xLength):
        covariance += (x[i] - xMean) * (y[i] - yMean)
    covariance /= (xLength - 1)
    
    xStdDev = (sum((xi - xMean) ** 2 for xi in x) / (xLength - 1)) ** 0.5
    yStdDev = (sum((yi - yMean) ** 2 for yi in y) / (xLength - 1)) ** 0.5
    
    correlation = covariance / (xStdDev * yStdDev)
    
    return correlation

x = [1, 2, 3, 4, 5]
y = [5, 4, 3, 2, 1]

correlation = computeCorrelation(x, y)
print("Correlation:", correlation)

Correlation: -0.9999999999999998


In [7]:
def rangeNormalize(data):
    minValues = [min(col) for col in zip(*data)]
    maxValues = [max(col) for col in zip(*data)]
    
    normalizedData = []
    for row in data:
        normalizedRow = []
        for val, minVal, maxVal in zip(row, minValues, maxValues):
            normalizedVal = (val - minVal) / (maxVal - minVal)
            normalizedRow.append(normalizedVal)
        normalizedData.append(normalizedRow)
    
    return normalizedData

data = [[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]]

normalizedData = rangeNormalize(data)
print("Normalized data:")
for row in normalizedData:
    print(row)

Normalized data:
[0.0, 0.0, 0.0]
[0.5, 0.5, 0.5]
[1.0, 1.0, 1.0]


In [8]:
def standardNormalize(data):
    numRows = len(data)
    numCols = len(data[0])
    
    means = [sum(col) / numRows for col in zip(*data)]
    stdDevs = [(sum((val - means[i]) ** 2 for val in col) / numRows) ** 0.5 for i, col in enumerate(zip(*data))]
    
    normalizedData = []
    for row in data:
        normalizedRow = []
        for val, mean, stdDev in zip(row, means, stdDevs):
            normalizedVal = (val - mean) / stdDev if stdDev != 0 else val
            normalizedRow.append(normalizedVal)
        normalizedData.append(normalizedRow)
    
    return normalizedData

data = [[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]]

normalizedData = standardNormalize(data)
print("Normalized data:")
for row in normalizedData:
    print(row)


Normalized data:
[-1.2247448713915892, -1.2247448713915892, -1.2247448713915892]
[0.0, 0.0, 0.0]
[1.2247448713915892, 1.2247448713915892, 1.2247448713915892]


In [9]:
def computeCovarianceMatrix(array):
    numRows = len(array)
    numCols = len(array[0])
    
    means = [sum(col) / numRows for col in zip(*data)]
    
    covarianceMatrix = [[0] * numCols for _ in range(numCols)]
    
    for i in range(numCols):
        for j in range(numCols):
            covariance = 0
            for k in range(numRows):
                covariance += (array[k][i] - means[i]) * (array[k][j] - means[j])
            covariance /= (numRows - 1)
            covarianceMatrix[i][j] = covariance
    
    return covarianceMatrix

array = [[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]]

covarianceMatrix = computeCovarianceMatrix(array)
print("Covariance matrix:")
for row in covarianceMatrix:
    print(row)

Covariance matrix:
[9.0, 9.0, 9.0]
[9.0, 9.0, 9.0]
[9.0, 9.0, 9.0]


In [10]:
def labelEncode(array):
    uniqueCategories = set()
    
    # create set of unique values present in a column
    for item in array:
        uniqueCategories.add(item)
    
    # enumerate values in the set, assigning each value to an integer equivalent
    categoryToLabel = {category: i for i, category in enumerate(uniqueCategories)}
    
    # replace each column value in the dataset with its integer equivalent
    print(len(array))
    for i in range(len(array)):
        array[i] = categoryToLabel[item]

In [11]:
# ignore warning regarding chained indexing, that is not what is happening;
# we want to modify the df directly
pd.options.mode.chained_assignment = None
labelEncode(raisin['Class'])

900


In [13]:
# there are no missing values in our dataset
raisin.isnull().sum()

Area               0
MajorAxisLength    0
MinorAxisLength    0
Eccentricity       0
ConvexArea         0
Extent             0
Perimeter          0
Class              0
dtype: int64

In [15]:
computeMean(raisin)

KeyError: 0