In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from scipy import stats
from IPython.display import display, HTML
data = pd.read_csv("wine.data", header=None)
data.columns

data.columns = ["V"+str(i) for i in range(1, len(data.columns)+1)] #rename column names to be similar to R naming convention
data.V1 = data.V1.astype(str)
X = data.loc[:, "V2":] # independent variables data
y = data.V1 # dependednt variable data

lda = LinearDiscriminantAnalysis().fit(X, y) 

def pretty_scalings(lda, X, out=False):
    ret = pd.DataFrame(lda.scalings_, index=X.columns, columns=["LD"+str(i+1) for i in range(lda.scalings_.shape[1])])
    if out:
        print("Coefficients of linear discriminants:")
        display(ret)
    return ret


pretty_scalings_ = pretty_scalings(lda, X, out=True)

Coefficients of linear discriminants:


Unnamed: 0,LD1,LD2
V2,0.4034,0.871793
V3,-0.165255,0.30538
V4,0.369075,2.34585
V5,-0.154798,-0.146381
V6,0.002163,-0.000463
V7,-0.618052,-0.032213
V8,1.661191,-0.491998
V9,1.495818,-1.630954
V10,-0.134093,-0.307088
V11,-0.355056,0.253231


In [7]:
lda.scalings_[:, 0] 

array([ 0.40339978, -0.1652546 ,  0.36907526, -0.15479789,  0.0021635 ,
       -0.61805207,  1.66119123,  1.49581844, -0.13409263, -0.35505571,
        0.81803607,  1.15755938,  0.00269121])

In [8]:
def calclda(variables, loadings):
    # find the number of samples in the data set and the number of variables
    numsamples, numvariables = variables.shape
    # make a vector to store the discriminant function
    ld = np.zeros(numsamples)
    # calculate the value of the discriminant function for each sample
    for i in range(numsamples):
        valuei = 0
        for j in range(numvariables):
            valueij = variables.iloc[i, j]
            loadingj = loadings[j]
            valuei = valuei + (valueij * loadingj)
        ld[i] = valuei
    # standardise the discriminant function so that its mean value is 0:
    ld = scale(ld, with_std=False)
    return ld

calclda(X, lda.scalings_[:, 0])

array([ 4.70024401,  4.30195811,  3.42071952,  4.20575366,  1.50998168,
        4.51868934,  4.52737794,  4.14834781,  3.86082876,  3.36662444,
        4.80587907,  3.42807646,  3.66610246,  5.58824635,  5.50131449,
        3.18475189,  3.28936988,  2.99809262,  5.24640372,  3.13653106,
        3.57747791,  1.69077135,  4.83515033,  3.09588961,  3.32164716,
        2.14482223,  3.9824285 ,  2.68591432,  3.56309464,  3.17301573,
        2.99626797,  3.56866244,  3.38506383,  3.5275375 ,  2.85190852,
        2.79411996,  2.75808511,  2.17734477,  3.02926382,  3.27105228,
        2.92065533,  2.23721062,  4.69972568,  1.23036133,  2.58203904,
        2.58312049,  3.88887889,  3.44975356,  2.34223331,  3.52062596,
        3.21840912,  4.38214896,  4.36311727,  3.51917293,  3.12277475,
        1.8024054 ,  2.87378754,  3.61690518,  3.73868551, -1.58618749,
       -0.79967216, -2.38015446,  0.45917726,  0.50726885, -0.39398359,
        0.92256616,  1.95549377,  0.34732815, -0.20371212,  0.24

In [9]:
def groupStandardise(variables, groupvariable):
    # find the number of samples in the data set and the number of variables
    numsamples, numvariables = variables.shape
    # find the variable names
    variablenames = variables.columns
    # calculate the group-standardised version of each variable
    variables_new = pd.DataFrame()
    for i in range(numvariables):
        variable_name = variablenames[i]
        variablei = variables[variable_name]
        variablei_Vw = calcWithinGroupsVariance(variablei, groupvariable)
        variablei_mean = np.mean(variablei)
        variablei_new = (variablei - variablei_mean)/(np.sqrt(variablei_Vw))
        variables_new[variable_name] = variablei_new
    return variables_new

groupStandardise(X, y)

NameError: name 'calcWithinGroupsVariance' is not defined

In [10]:
def rpredict(lda, X, y, out=False):
    ret = {"class": lda.predict(X), "posterior": pd.DataFrame(lda.predict_proba(X), columns=lda.classes_)}
    ret["x"] = pd.DataFrame(lda.fit_transform(X, y))
    ret["x"].columns = ["LD"+str(i+1) for i in range(ret["x"].shape[1])]
    if out:
        print("class")
        print(ret["class"])
        print()
        print("posterior")
        print(ret["posterior"])
        print()
        print("x")
        print(ret["x"])
    return ret

lda_values = rpredict(lda, standardisedX, y, True) 

NameError: name 'standardisedX' is not defined

In [11]:
def ldahist(data, g, sep=False):
    xmin = np.trunc(np.min(data)) - 1
    xmax = np.trunc(np.max(data)) + 1
    ncol = len(set(g))
    binwidth = 0.5
    bins=np.arange(xmin, xmax + binwidth, binwidth)
    if sep:
        fig, axl = plt.subplots(ncol, 1, sharey=True, sharex=True)
    else:
        fig, axl = plt.subplots(1, 1, sharey=True, sharex=True)
        axl = [axl]*ncol
    for ax, (group, gdata) in zip(axl, data.groupby(g)):
        sns.distplot(gdata.values, bins, ax=ax, label="group "+str(group))
        ax.set_xlim([xmin, xmax])
    if sep:
        ax.set_xlabel("group"+str(group))
    else:
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        
    plt.tight_layout() 
    


In [12]:
ldahist(lda_values["x"].LD1, y) 

NameError: name 'lda_values' is not defined

In [13]:
ldahist(lda_values["x"].LD2, y) 

NameError: name 'lda_values' is not defined

In [14]:
sns.lmplot("LD1", "LD2", lda_values["x"].join(y), hue="V1", fit_reg=False)

NameError: name 'lda_values' is not defined

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import decomposition
from sklearn import datasets

class PCA:
    
    def _init_(self,n_components):
        self.n_components=n_components
        self.components=None
        self.mean=None
        
    def fit(self, X):
        self.mean=np.mean(X,axis=0)
        X=X-self.mean
        cov=np.cov(X.T)
        eigenvalues,eigenvectors=np.linalg.eig(cov)
        eigenvectors=eigenvectors.T
        idxs=np.argsort(eigenvalues)[::-1]
        eigenvalues=eigenvalues[idxs]
        eigenvectors=eigenvectors[idxs] 
        self.components=eigenvectors[0:self.n_components]

    def transform(self,X):
        X=X-self.mean
        return np.dot(X,self.components.T)
