# Fluency Rateを線形重回帰、lasso回帰

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn import linear_model, preprocessing
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

In [2]:
import pandas as pd
import numpy as np
from pandas import Series,DataFrame
from statistics import mean
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# import xlrd
# Open the workbook that holds the quantity features and the fluency scores.
data1 = pd.ExcelFile('main_data_1.xlsx')
# 'Sheet1' carries the data table; per the original note, the explanation of
# each column lives in a separate explanation sheet of the same workbook.
df1 = data1.parse(sheet_name='Sheet1')

# Regression inputs.
# ATCL_P: articulation rate (original note: "by using Prrat" -- presumably Praat; confirm).
X = df1.filter(items=["ATCL_P", "phonation_rate"])
# Regression target: FR, the fluency rating, kept as a one-column DataFrame.
y = df1.filter(items=["FR"])

print(y)
print(X)

     FR
0   5.8
1   3.5
2   2.0
3   3.1
4   3.6
..  ...
95  9.0
96  9.0
97  9.0
98  9.0
99  9.0

[100 rows x 1 columns]
    ATCL_P  phonation_rate
0     3.30        0.785853
1     3.43        0.678720
2     3.31        0.465385
3     2.87        0.571048
4     4.02        0.637371
..     ...             ...
95    3.75        0.918424
96    4.99        0.690372
97    4.19        0.882544
98    4.44        0.791738
99    4.47        0.791246

[100 rows x 2 columns]


In [6]:
# Preview the first rows of the loaded table to see which columns are available.
df1.head()

Unnamed: 0,ID,FG,FR,ATCL_P,dur (s),phn-tm (s),phonation_rate
0,101,1,5.8,3.3,15.41,12.11,0.785853
1,102,1,3.5,3.43,28.76,19.52,0.67872
2,103,1,2.0,3.31,23.4,10.89,0.465385
3,104,1,3.1,2.87,26.25,14.99,0.571048
4,105,1,3.6,4.02,26.17,16.68,0.637371


In [29]:
def average_maximum_possibility(post):
    """Return the mean of the per-row maxima of a 2-D array.

    Args:
        post: 2-D array-like; the maximum is taken along axis 1 (within each row).
            Presumably rows are frames and columns are class posteriors -- confirm.

    Returns:
        The average, over all rows, of each row's largest value.
    """
    row_peaks = np.max(post, axis=1)
    return np.mean(row_peaks)

In [30]:
def average_post(post):
    """Return the column-wise mean of a 2-D array.

    Args:
        post: 2-D array-like; values are averaged along axis 0 (down the rows).

    Returns:
        One mean value per column.
    """
    column_means = np.mean(post, axis=0)
    return column_means

In [31]:
# Per-utterance posterior statistics, collected in utterance order (UTT1 .. UTT100).
avg_max_pos = []
avg_post = []
x = []  # never filled in this cell; retained only in case another cell expects the name

# NOTE: this cell used to run `y = []` as well. That list was never used, but it
# rebound the regression target `y` (the FR column selected from df1) to an
# empty list, so any later `elastic_cv(X, y, ...)` call in a top-to-bottom run
# failed on `y.iloc`. The assignment has been removed.

for uttid in range(1, 101):
    post_file_path = 'post/dnn5b_pretrain-dbn_dnn_2000_noise_aug/clean_wsj/50/UTT{}.csv'.format(uttid)
    # The CSV has no header row; every line is data.
    # Presumably one row per frame and one column per class posterior -- confirm.
    post_pd = pd.read_csv(post_file_path, header=None)
    post = post_pd.to_numpy()
    avg_max_pos.append(average_maximum_possibility(post))
    avg_post.append(average_post(post))

# Attach the per-utterance statistic as a new column; the list is assigned
# positionally, so it must have exactly one entry per row of df1.
df1['average_maximum_possibility'] = avg_max_pos

In [32]:
# Preview df1 again to check the newly attached 'average_maximum_possibility' column.
# NOTE(review): the recorded output also lists a 'maximum_possibility_penalties'
# column that no visible cell creates -- presumably left over from a removed or
# out-of-order cell; confirm before relying on it.
df1.head()

Unnamed: 0,ID,FG,FR,ATCL_P,dur (s),phn-tm (s),phonation_rate,maximum_possibility_penalties,average_maximum_possibility
0,101,1,5.8,3.3,15.41,12.11,0.785853,2.116858,0.790759
1,102,1,3.5,3.43,28.76,19.52,0.67872,2.177951,0.773674
2,103,1,2.0,3.31,23.4,10.89,0.465385,2.133309,0.779387
3,104,1,3.1,2.87,26.25,14.99,0.571048,2.144109,0.782529
4,105,1,3.6,4.02,26.17,16.68,0.637371,2.463123,0.755062


In [7]:
def elastic_cv(X, y, k, c, alpha=0.1, l1_ratio=0.9, random_state=None):
    """Evaluate an ElasticNet regressor with shuffled k-fold cross-validation.

    For every fold the features are standardized with a StandardScaler fitted
    on the training part only, an ElasticNet model is fitted, and the held-out
    part is predicted. Three summaries are printed:

    * the mean over folds of each of the first ``c`` coefficients (these are
      coefficients of the standardized features),
    * the Pearson correlation (and its p-value) between all out-of-fold
      predictions and the corresponding targets, pooled across folds,
    * the mean of the per-fold R^2 scores on the held-out parts.

    Args:
        X: feature table; must support ``.iloc`` row selection.
        y: target; must support ``.iloc`` row selection and ``.values``.
        k: number of cross-validation folds.
        c: number of leading coefficients to average and print; must not
            exceed the number of fitted coefficients.
        alpha: ElasticNet regularization strength (default matches the
            previously hard-coded value).
        l1_ratio: ElasticNet L1/L2 mixing ratio (default matches the
            previously hard-coded value).
        random_state: seed forwarded to KFold's shuffling. ``None`` keeps the
            original behaviour, where the split differs from run to run.

    Returns:
        None. All results are printed.
    """
    from scipy.stats import pearsonr

    coef = []
    r2_test = []
    all_pred = []
    all_y = []
    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    clf = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        # Fit the scaler on the training rows only so no held-out statistics
        # leak into the model.
        ss = StandardScaler()
        X_train = ss.fit_transform(X_train)
        X_test = ss.transform(X_test)
        clf.fit(X_train, y_train)
        coef.append(clf.coef_)
        pred_test = clf.predict(X_test)
        r2_test.append(r2_score(y_test, pred_test))
        # Flatten per fold: folds can differ in size when the number of rows
        # is not divisible by k, so the pieces cannot be stacked as one array.
        all_pred.append(np.ravel(pred_test))
        all_y.append(np.ravel(y_test.values))

    # Average each of the first c coefficients across the folds.
    coef_mean = [mean([fold_coef[j] for fold_coef in coef]) for j in range(c)]
    print("回帰変数", coef_mean)

    # Pool the out-of-fold predictions and targets in fold order.
    a, b = pearsonr(np.concatenate(all_pred), np.concatenate(all_y))
    print("相関係数:", a)
    print("p値:", b)

    print('r2_test:', mean(r2_test))  # r2_test: coefficient of determination

In [8]:
# Run the cross-validated ElasticNet evaluation with k=5 folds, averaging the first c=2 coefficients.
elastic_cv(X,y,5,2)

回帰変数 [0.7634626984956112, 1.3354982691744919]
相関係数: 0.8191443559900353
p値: 2.1503861562734664e-25
r2_test: 0.6670081753647026
