In [9]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.decomposition import PCA
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge

In [5]:
# Load the training set. Per the preview below it holds the job description text,
# the normalised location, the contract time and the SalaryNormalized column that
# is later used as the regression target.
data = pd.read_csv("salary-train.csv")
data.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,International Sales Manager London ****k ****...,London,permanent,33000
1,An ideal opportunity for an individual that ha...,London,permanent,50000
2,Online Content and Brand Manager// Luxury Reta...,South East London,permanent,40000
3,A great local marketleader is seeking a perman...,Dereham,permanent,22500
4,Registered Nurse / RGN Nursing Home for Young...,Sutton Coldfield,,20355


In [14]:
def text_transform(text: pd.Series) -> pd.Series:
    """Normalise raw text before it is vectorised.

    Each string is lower-cased, then every character that is not an ASCII
    letter or digit is replaced by one space. Replacement is per character,
    so consecutive separators produce consecutive spaces.

    Parameters
    ----------
    text : pd.Series
        Series of strings to clean.

    Returns
    -------
    pd.Series
        A new series with the same index holding the cleaned strings.
    """
    non_alphanumeric = "[^a-zA-Z0-9]"
    lowered = text.str.lower()
    return lowered.replace(to_replace=non_alphanumeric, value=" ", regex=True)

In [7]:
# Preview the first rows of the training frame again.
# NOTE(review): this repeats the preview shown right after loading the CSV — confirm
# whether the cell is still needed.
data.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,International Sales Manager London ****k ****...,London,permanent,33000
1,An ideal opportunity for an individual that ha...,London,permanent,50000
2,Online Content and Brand Manager// Luxury Reta...,South East London,permanent,40000
3,A great local marketleader is seeking a perman...,Dereham,permanent,22500
4,Registered Nurse / RGN Nursing Home for Young...,Sutton Coldfield,,20355


In [19]:
# TF-IDF features for the job descriptions. min_df=5 drops terms that appear in
# fewer than five documents. The vocabulary is learned here and reused for the
# test descriptions later on.
vectorizer = TfidfVectorizer(min_df=5)
train_descriptions = text_transform(data["FullDescription"])
X_train_text = vectorizer.fit_transform(train_descriptions)

In [17]:
# Encode missing categorical values as the literal string 'nan' so that the
# DictVectorizer treats "missing" as its own category.
#
# The fill is done on the frame and re-assigned instead of calling
# `data[col].fillna(..., inplace=True)`: that form mutates a temporary column
# selection, which emits a FutureWarning in pandas 2.x and leaves `data`
# unchanged once Copy-on-Write is active. Re-assigning is also idempotent, so
# re-running the cell is safe.
data = data.fillna({"LocationNormalized": "nan", "ContractTime": "nan"})

In [28]:
# Encode the two categorical columns: every row becomes a {column: value} dict and
# the DictVectorizer turns each distinct string value into its own indicator feature.
# The fitted `dic` is reused to encode the test rows.
dic = DictVectorizer()
train_categorical_records = data[["LocationNormalized", "ContractTime"]].to_dict("records")
X_train_dic = dic.fit_transform(train_categorical_records)

In [32]:
# Place the TF-IDF text features and the encoded categorical features side by side
# in one sparse training matrix (text columns first).
X_train = hstack([X_train_text, X_train_dic])

In [34]:
# Regression target: the SalaryNormalized column of the training frame.
y_train = data["SalaryNormalized"]

In [35]:
# Ridge regression with alpha=1 and a fixed random_state, fitted on the combined
# sparse features. The fit() call is the last expression of the cell, so the fitted
# estimator is echoed as the cell output.
lnr = Ridge(alpha = 1, random_state = 241)
lnr.fit(X_train, y_train)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=241, solver='auto', tol=0.001)

In [37]:
# Load the small test set for which salaries are predicted further down.
data_test = pd.read_csv("salary-test-mini.csv")

In [38]:
# Show the whole test frame (the recorded output has two rows with an empty
# SalaryNormalized column).
data_test

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,We currently have a vacancy for an HR Project ...,Milton Keynes,contract,
1,A Web developer opportunity has arisen with an...,Manchester,permanent,


In [43]:
# Build the test design matrix with the transformers fitted on the training data
# (transform only, never fit, so the feature columns line up with X_train).
X_text_test = vectorizer.transform(text_transform(data_test["FullDescription"]))

# The training categorical columns have their missing values replaced by the string
# 'nan' before the DictVectorizer is fitted. Apply the same encoding here; otherwise
# a missing test value would arrive as a float NaN instead of the 'nan' category the
# vectorizer learned, and the row would not get the matching indicator feature.
test_categorical = data_test[["LocationNormalized", "ContractTime"]].fillna("nan")
X_dic_test = dic.transform(test_categorical.to_dict("records"))

# Same column order as in training: text features first, then categorical features.
X_test = hstack([X_text_test, X_dic_test])

In [44]:
# Predicted salaries for the test rows from the fitted ridge model.
y_test = lnr.predict(X_test)

In [46]:
# Print each predicted salary on its own line, rounded to two decimals.
for predicted_salary in y_test:
    print(format(predicted_salary, ".2f"))

56555.62
37188.32


In [52]:
# Second task: PCA on daily closing prices.
# PCA is imported in the notebook's top import cell together with the other
# dependencies, so this cell only loads and previews the data.
# Note that `data` is rebound here: from this point on it refers to the closing
# prices, not to the salary training frame loaded earlier.
data = pd.read_csv("close_prices.csv")
data.head()

Unnamed: 0,date,AXP,BA,CAT,CSCO,CVX,DD,DIS,GE,GS,...,PFE,PG,T,TRV,UNH,UTX,V,VZ,WMT,XOM
0,2013-09-23,76.440002,117.510002,85.029999,24.27,125.519997,59.409999,64.75,24.280001,165.25,...,28.799999,79.279999,34.220001,86.379997,71.82,109.419998,196.240005,47.98,76.419998,87.75
1,2013-09-24,76.07,119.0,85.110001,24.139999,124.489998,59.319997,64.32,24.32,162.970001,...,28.709999,78.620003,34.09,85.870003,72.32,110.0,193.339996,47.27,75.75,87.360001
2,2013-09-25,75.989998,118.510002,84.5,24.43,124.07,59.319997,64.449997,24.23,162.309998,...,28.49,77.720001,34.049999,85.980003,71.980003,109.260002,191.559998,46.950001,74.650002,87.139999
3,2013-09-26,76.32,119.379997,84.199997,23.77,123.489998,59.509996,65.239998,24.25,162.289993,...,28.52,78.050003,34.23,85.830002,72.160004,109.660004,193.559998,47.669998,74.620003,87.07
4,2013-09-27,75.889999,118.739998,83.800003,23.33,122.639999,59.009995,65.190002,24.049999,159.850006,...,28.879999,77.209999,33.98,85.410004,71.989998,109.360001,193.050003,47.0,74.360001,86.900002


In [53]:
# PCA model configured to keep 10 components.
pc = PCA(n_components = 10)

In [58]:
# Use every column except the first as PCA input (presumably the first column is the
# date shown in the preview — confirm), then fit the PCA on it.
X = data.iloc[:, 1:]
pc.fit(X)

PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [61]:
# Find how many leading principal components are needed for their cumulative
# explained-variance ratio to reach 0.9. When the loop breaks, `i` is the zero-based
# index of the component that crossed the threshold, so the count is i + 1.
sum_var = 0
for i, v in enumerate(pc.explained_variance_ratio_):
    sum_var += v
    if sum_var >= 0.9:
        break
else:
    # The loop ran out of components without reaching the threshold. Without this
    # guard `i` would silently stay at the last index and the reported count would
    # be wrong, so fail loudly instead.
    raise ValueError(
        "the fitted components explain only %.4f of the variance, below the 0.9 threshold"
        % sum_var
    )
pc.explained_variance_ratio_

array([0.73897118, 0.11007169, 0.04995088, 0.0287492 , 0.02215448,
       0.01931577, 0.00674853, 0.00614091, 0.00320594, 0.00305611])

In [87]:
# `i` is the loop index left behind by the cumulative explained-variance loop in the
# preceding cell, so i + 1 is the number of components counted there.
# NOTE(review): this relies on the global `i` not being rebound by another cell in
# between — confirm the cells are run in order.
print(i+1)

4



На осях написаны проценты объяснённой дисперсии – эти числа показывают, какая доля общего разброса точек приходится на каждую из новых координат.


In [64]:
# Project the price data onto the fitted components and keep only the first one as a
# Series named 0 with a default integer index.
projected = pc.transform(X)
X0 = pd.Series(projected[:, 0], name=0)
X0.head()

0   -50.902404
1   -52.846909
2   -54.614439
3   -52.600566
4   -52.370123
Name: 0, dtype: float64

In [65]:
# Load the index values; the "^DJI" column is compared with the first principal
# component below.
indexes = pd.read_csv("djia_index.csv")
indexes.head()

Unnamed: 0,date,^DJI
0,2013-09-23,15401.379883
1,2013-09-24,15334.589844
2,2013-09-25,15273.259766
3,2013-09-26,15328.299805
4,2013-09-27,15258.240234


In [66]:
import numpy as np

In [69]:
# Correlation matrix between the first principal component and the "^DJI" series.
# The off-diagonal entries hold the correlation coefficient of the two series.
# NOTE(review): the two series are paired by row position, not by date — confirm both
# files cover the same dates in the same order.
djia_values = indexes["^DJI"]
corr = np.corrcoef(X0, djia_values)
corr

array([[1.        , 0.90965222],
       [0.90965222, 1.        ]])

In [72]:
# Report the correlation coefficient (off-diagonal entry) rounded to two decimals.
print(format(corr[0, 1], ".2f"))

0.91


In [85]:
# Name of the column at position 4 of the PCA input frame.
# NOTE(review): the index 4 is hard-coded and nothing else in the notebook uses the
# result — confirm whether this is leftover exploration.
X.columns[4]

'CVX'

In [83]:
# Column with the largest weight in the first principal component.
# NOTE(review): argmax picks the largest signed weight, not the largest magnitude —
# confirm that this is the intended notion of "largest weight".
first_component_weights = pc.components_[0]
heaviest_position = np.argmax(first_component_weights)
X.columns[heaviest_position]

'V'