In [15]:
import pandas as pd
import numpy as np
from random import shuffle
from numpy.linalg import inv
import matplotlib.pyplot as plt
from math import floor, log
import os

In [2]:
# Load the raw training set; the path is relative to the notebook's working directory.
trainData = pd.read_csv("data/train.csv")

# Remove 'sex' and 'income' from the feature frame. 'sex' is re-encoded
# separately as a 0/1 indicator (male = 0, female = 1) in a later cell.
# NOTE(review): 'income' is presumably the prediction target -- confirm.
data = trainData.drop(columns=['sex', 'income'])

# Only the last expression of a cell is rendered: show the 'age' column.
data['age']

0        39
1        50
2        38
3        53
4        28
         ..
32556    27
32557    40
32558    58
32559    22
32560    52
Name: age, Length: 32561, dtype: int64

In [4]:

# Names of the non-numeric (object-dtype) columns, in their original order.
listObjectColumn = [
    name for name, column_dtype in data.dtypes.items() if column_dtype == "object"
]
# Every remaining column is treated as numeric.
listNonObjedtColumn = [name for name in data.columns if name not in listObjectColumn]
print(listObjectColumn, listNonObjedtColumn, sep="\n")

['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'native_country']
['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']


In [5]:
# Numeric feature columns only (all rows, columns picked by name).
NonObjectData = data.loc[:, listNonObjedtColumn]
NonObjectData

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,39,77516,13,2174,0,40
1,50,83311,13,0,0,13
2,38,215646,9,0,0,40
3,53,234721,7,0,0,40
4,28,338409,13,0,0,40
...,...,...,...,...,...,...
32556,27,257302,12,0,0,38
32557,40,154374,9,0,0,40
32558,58,151910,9,0,0,40
32559,22,201490,9,0,0,20


In [6]:
# Categorical (object-dtype) feature columns only (all rows, columns picked by name).
ObjectData = data.loc[:, listObjectColumn]
ObjectData

Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,native_country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Cuba
...,...,...,...,...,...,...,...
32556,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,United-States
32557,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,United-States
32558,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,United-States
32559,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,United-States


In [7]:
# Preview the 0/1 encoding of 'sex': 1 for ' Female', 0 otherwise.
# The leading space in ' Female' is intentional -- the compared value must match
# the raw string stored in the column exactly.
# `np.int` was deprecated in NumPy 1.20 and removed in 1.24; the builtin `int`
# is the documented replacement and yields the same int64 result here.
(trainData.sex == ' Female').astype(int)

0        0
1        0
2        0
3        0
4        1
        ..
32556    1
32557    0
32558    1
32559    0
32560    1
Name: sex, Length: 32561, dtype: int64

In [8]:
# Rebuild the categorical frame from `data` before inserting, so this cell can be
# re-run: DataFrame.insert raises ValueError if a 'sex' column already exists.
ObjectData = data[listObjectColumn].copy()

# Put the 0/1 'sex' indicator (1 = ' Female', 0 otherwise) in the first position.
# The builtin `int` replaces `np.int`, which was removed in NumPy 1.24.
ObjectData.insert(0, 'sex', (trainData.sex == ' Female').astype(int))
ObjectData

Unnamed: 0,sex,workclass,education,marital_status,occupation,relationship,race,native_country
0,0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,United-States
1,0,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,United-States
2,0,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,United-States
3,0,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,United-States
4,1,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Cuba
...,...,...,...,...,...,...,...,...
32556,1,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,United-States
32557,0,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,United-States
32558,1,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,United-States
32559,0,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,United-States


In [10]:
# One-hot encoding: each categorical column is expanded into one indicator
# column per distinct value; the already-numeric 'sex' column is left as is.
ObjectData = pd.get_dummies(data=ObjectData)
ObjectData

Unnamed: 0,sex,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,...,native_country_ Portugal,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32557,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32558,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32559,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [11]:
# Concatenate column-wise (side by side, rows aligned on the index): numeric
# features first, then the encoded categorical features.
# (The original note suggests looking up a Zhihu answer on how to read `axis`.)
Data = pd.concat(objs=[NonObjectData, ObjectData], axis="columns")
Data

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,sex,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,...,native_country_ Portugal,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,12,0,0,38,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32557,40,154374,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32558,58,151910,9,0,0,40,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32559,22,201490,9,0,0,20,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [12]:
# Z-score standardisation: cast everything to int64, then subtract each column's
# mean and divide by its standard deviation (pandas' default sample std).
# NOTE(review): the 'native_country_ Taiwan' column is dropped afterwards; the
# reason is not visible in this notebook -- confirm why it is excluded.
Data_x = (
    Data.astype("int64")
    .pipe(lambda frame: (frame - frame.mean()) / frame.std())
    .drop(columns='native_country_ Taiwan')
)
Data_x

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,sex,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,...,native_country_ Poland,native_country_ Portugal,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia
0,0.030670,-1.063594,1.134721,0.148451,-0.216656,-0.035429,-0.703061,-0.244446,-0.174292,-0.262093,...,-0.042966,-0.033728,-0.059273,-0.019201,-0.049628,-0.023518,-0.024163,0.340949,-0.045408,-0.022172
1,0.837096,-1.008692,1.134721,-0.145918,-0.216656,-2.222119,-0.703061,-0.244446,-0.174292,-0.262093,...,-0.042966,-0.033728,-0.059273,-0.019201,-0.049628,-0.023518,-0.024163,0.340949,-0.045408,-0.022172
2,-0.042641,0.245075,-0.420053,-0.145918,-0.216656,-0.035429,-0.703061,-0.244446,-0.174292,-0.262093,...,-0.042966,-0.033728,-0.059273,-0.019201,-0.049628,-0.023518,-0.024163,0.340949,-0.045408,-0.022172
3,1.057031,0.425795,-1.197440,-0.145918,-0.216656,-0.035429,-0.703061,-0.244446,-0.174292,-0.262093,...,-0.042966,-0.033728,-0.059273,-0.019201,-0.049628,-0.023518,-0.024163,0.340949,-0.045408,-0.022172
4,-0.775756,1.408154,1.134721,-0.145918,-0.216656,-0.035429,1.422309,-0.244446,-0.174292,-0.262093,...,-0.042966,-0.033728,-0.059273,-0.019201,-0.049628,-0.023518,-0.024163,-2.932903,-0.045408,-0.022172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,-0.849067,0.639731,0.746028,-0.145918,-0.216656,-0.197406,1.422309,-0.244446,-0.174292,-0.262093,...,-0.042966,-0.033728,-0.059273,-0.019201,-0.049628,-0.023518,-0.024163,0.340949,-0.045408,-0.022172
32557,0.103982,-0.335428,-0.420053,-0.145918,-0.216656,-0.035429,-0.703061,-0.244446,-0.174292,-0.262093,...,-0.042966,-0.033728,-0.059273,-0.019201,-0.049628,-0.023518,-0.024163,0.340949,-0.045408,-0.022172
32558,1.423588,-0.358772,-0.420053,-0.145918,-0.216656,-0.035429,1.422309,-0.244446,-0.174292,-0.262093,...,-0.042966,-0.033728,-0.059273,-0.019201,-0.049628,-0.023518,-0.024163,0.340949,-0.045408,-0.022172
32559,-1.215625,0.110958,-0.420053,-0.145918,-0.216656,-1.655199,-0.703061,-0.244446,-0.174292,-0.262093,...,-0.042966,-0.033728,-0.059273,-0.019201,-0.049628,-0.023518,-0.024163,0.340949,-0.045408,-0.022172


In [13]:
dd = np.zeros((4, 4))

# Outer product of the first standardised sample (shifted by -1) with itself.
# Background on covariance matrices: https://www.cnblogs.com/terencezhou/p/6235974.html
# The vector is turned into an explicit (n, 1) column and (1, n) row because
# transposing a 1-D array is a no-op; a plain dot of the 1-D vectors would give a
# scalar inner product instead of an (n, n) matrix.
first_row_shifted = Data_x.values[0] - 1
first_row_shifted[:, np.newaxis] @ first_row_shifted[np.newaxis, :]

array([[ 0.93960048,  2.00030379, -0.13058942, ...,  0.63883816,
         1.01334492,  0.99082221],
       [ 2.00030379,  4.2584219 , -0.2780102 , ...,  1.36001462,
         2.15729741,  2.10934909],
       [-0.13058942, -0.2780102 ,  0.01814984, ..., -0.08878828,
        -0.14083872, -0.13770842],
       ...,
       [ 0.63883816,  1.36001462, -0.08878828, ...,  0.43434864,
         0.6889773 ,  0.67366402],
       [ 1.01334492,  2.15729741, -0.14083872, ...,  0.6889773 ,
         1.09287718,  1.06858678],
       [ 0.99082221,  2.10934909, -0.13770842, ...,  0.67366402,
         1.06858678,  1.04483626]])

In [22]:
# A single random integer drawn from {1, 2} (upper bound exclusive) is broadcast
# to every cell of a 2x2 frame, so all four entries are always equal.
new_df = pd.DataFrame(
    data=np.random.randint(low=1, high=3),
    index=[1, 2],
    columns=list("AB"),
)
new_df

Unnamed: 0,A,B
1,2,2
2,2,2
