# Fluency Rate を線形重回帰・Lasso 回帰(ElasticNet)で予測する

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn import linear_model, preprocessing
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

In [2]:
import pandas as pd
import numpy as np
from pandas import Series,DataFrame
from statistics import mean
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# import xlrd
# Load the workbook that holds the quantity features and the fluency scores.
data1 = pd.ExcelFile('main_data_1.xlsx')  # quantity features and the fluency score are in this Excel file
df1 = data1.parse('Sheet1') # an explanation of each column is kept in the workbook's explanation sheet
y = df1.filter(["FR"]) # fluency rating

In [4]:
# Peek at the first five rows to check that the sheet loaded as expected.
df1.head()

Unnamed: 0,ID,FG,FR,ATCL_P,dur (s),phn-tm (s),phonation_rate
0,101,1,5.8,3.3,15.41,12.11,0.785853
1,102,1,3.5,3.43,28.76,19.52,0.67872
2,103,1,2.0,3.31,23.4,10.89,0.465385
3,104,1,3.1,2.87,26.25,14.99,0.571048
4,105,1,3.6,4.02,26.17,16.68,0.637371


In [5]:
# Pull the FR column out as a NumPy array; the bare name on the last line displays it.
rates = y['FR'].to_numpy()
rates

array([5.8, 3.5, 2. , 3.1, 3.6, 2.7, 2. , 3.1, 2.5, 5.3, 3.7, 2.2, 2.2,
       3.3, 3.3, 3.3, 3.5, 1.6, 4.8, 6. , 3.8, 6.5, 4.3, 5.8, 7.2, 4.6,
       2.5, 5.8, 5.5, 5.8, 7.2, 4.6, 5.7, 4.7, 5.3, 6.8, 6. , 5.6, 7.6,
       5. , 3.7, 7.1, 8.2, 6.7, 6.7, 8.5, 6. , 7.1, 5.6, 8.4, 7. , 7.4,
       8.4, 6.4, 7.9, 6.8, 7.3, 7.5, 6.1, 6.9, 6.7, 5.4, 5.4, 5.4, 5.7,
       7.5, 5.8, 6.6, 8.1, 7.9, 7.7, 6.2, 8. , 4.5, 7.8, 7.1, 6.1, 7.6,
       8.4, 5.6, 1.8, 1.5, 2.6, 3.3, 2.5, 1.5, 2.7, 2.4, 2.9, 2.8, 8.5,
       8. , 8.8, 8.6, 8.6, 9. , 9. , 9. , 9. , 9. ])

In [6]:
def average_maximum_possibility(post):
    """Return the mean of the row-wise maxima of a 2-D array.

    Parameters
    ----------
    post : array-like, 2-D
        Matrix whose largest value is taken along axis 1 (one value per row).
        NOTE(review): presumably a frame-by-class posterior matrix - confirm.

    Returns
    -------
    float
        Average of the per-row maxima.
    """
    row_peaks = np.asarray(post).max(axis=1)
    return np.mean(row_peaks)

In [7]:
def average_post(post):
    """Return the column-wise mean of ``post``.

    Parameters
    ----------
    post : array-like, 2-D
        Matrix that is averaged over axis 0 (i.e. across its rows).

    Returns
    -------
    One mean value per column of ``post``.
    """
    column_means = np.mean(post, axis=0)
    return column_means

In [8]:
# Path template of the per-utterance posterior CSV files; {} is the 1-based
# utterance number.
POST_PATH_TEMPLATE = 'post/dnn5b_pretrain-dbn_dnn_2000_noise_aug/clean_wsj/50/UTT{}.csv'

avg_max_pos = []
avg_post_rows = []

# Read one CSV per row of df1 (UTT1 ... UTT<len(df1)>) instead of a hard-coded
# count, so the column assignment below always matches the frame's length.
# NOTE(review): assumes UTT<i>.csv belongs to the i-th row of df1 - confirm.
for uttid in range(1, len(df1) + 1):
    post = pd.read_csv(POST_PATH_TEMPLATE.format(uttid), header=None).to_numpy()
    avg_max_pos.append(average_maximum_possibility(post))
    avg_post_rows.append(average_post(post))

df1['average_maximum_possibility'] = avg_max_pos
# Stack the per-utterance mean posteriors into one 2-D array (row = utterance).
avg_posts = np.array(avg_post_rows)

In [9]:
def kl_divergence(p, q):
    """Return sum(p * log(p / q)) taken over the entries where ``p`` is non-zero.

    Entries with ``p == 0`` contribute nothing to the sum. The logarithm is
    evaluated only on the non-zero entries, so zeros in ``p`` no longer trigger
    divide-by-zero / invalid-value floating-point warnings (``np.where``
    evaluates both of its branches before selecting).

    Parameters
    ----------
    p, q : array-like
        Arrays of broadcast-compatible shape. Where ``p`` is non-zero and ``q``
        is zero the result is infinite, as before.

    Returns
    -------
    numpy floating scalar
        The summed divergence (0.0 when ``p`` has no non-zero entry).
    """
    p, q = np.broadcast_arrays(np.asarray(p), np.asarray(q))
    support = p != 0
    p_pos = p[support]
    return np.sum(p_pos * np.log(p_pos / q[support]))

def kl_divergence_between_fluent(fluent_posts, target_posts):
    """Compute ``kl_divergence`` for every (target, fluent reference) pair.

    Parameters
    ----------
    fluent_posts : sequence of arrays
        Reference distributions.
    target_posts : sequence of arrays
        Distributions to compare against each reference.

    Returns
    -------
    numpy.ndarray
        Array whose entry ``[i, j]`` is
        ``kl_divergence(target_posts[i], fluent_posts[j])``.
    """
    return np.array([
        [kl_divergence(target, reference) for reference in fluent_posts]
        for target in target_posts
    ])

# Utterances whose fluency rating exceeds 8.0 form the "fluent" reference set.
fluent_idx = np.where(rates > 8.0)
print('Fluent speakers: ', fluent_idx)
fluent_posts = avg_posts[fluent_idx]

# Row i holds the divergence of utterance i's mean posterior to each fluent
# reference; averaging over the references gives one score per utterance.
kl_divs_fluent = kl_divergence_between_fluent(fluent_posts, avg_posts)
kl_divs_fluent_mean = kl_divs_fluent.mean(axis=1)

# NOTE(review): fluent_posts is a subset of avg_posts, so each fluent utterance
# is also compared with itself (a zero term in its mean). The reference set is
# chosen from `rates`, i.e. the FR values, so this column is derived from the
# ratings themselves - keep that in mind if it is used to predict FR.
df1['average_KLdivergence_with_fluent'] = kl_divs_fluent_mean


Fluent speakers:  (array([42, 45, 49, 52, 68, 78, 90, 92, 93, 94, 95, 96, 97, 98, 99],
      dtype=int64),)


In [10]:
# Check that the two derived columns were appended to the frame.
df1.head()

Unnamed: 0,ID,FG,FR,ATCL_P,dur (s),phn-tm (s),phonation_rate,average_maximum_possibility,average_KLdivergence_with_fluent
0,101,1,5.8,3.3,15.41,12.11,0.785853,0.790759,0.226444
1,102,1,3.5,3.43,28.76,19.52,0.67872,0.773674,0.341848
2,103,1,2.0,3.31,23.4,10.89,0.465385,0.779387,0.433767
3,104,1,3.1,2.87,26.25,14.99,0.571048,0.782529,0.289169
4,105,1,3.6,4.02,26.17,16.68,0.637371,0.755062,0.349627


In [11]:
# Predictor columns used for the regression.
# ATCL_P is the articulation rate; the original note says it was obtained
# "by using Prrat" (presumably Praat - confirm).
feature_columns = [
    "ATCL_P",
    "phonation_rate",
    "average_maximum_possibility",
    "average_KLdivergence_with_fluent",
]
X = df1.filter(items=feature_columns)
y = df1.filter(items=["FR"])  # fluency rating (regression target)

In [12]:
# Sanity-check the selected feature columns.
X.head()

Unnamed: 0,ATCL_P,phonation_rate,average_maximum_possibility,average_KLdivergence_with_fluent
0,3.3,0.785853,0.790759,0.226444
1,3.43,0.67872,0.773674,0.341848
2,3.31,0.465385,0.779387,0.433767
3,2.87,0.571048,0.782529,0.289169
4,4.02,0.637371,0.755062,0.349627


In [13]:
# Sanity-check the target column.
y.head()

Unnamed: 0,FR
0,5.8
1,3.5
2,2.0
3,3.1
4,3.6


In [14]:
def elastic_cv(X, y, k, c, alpha=0.1, l1_ratio=0.9, random_state=None):
    """Evaluate an ElasticNet regressor with shuffled k-fold cross-validation.

    For every fold the features are standardised using statistics of the
    training split only, the model is fitted, and the held-out predictions
    are collected. Three summaries are printed (nothing is returned):

    * the first ``c`` coefficients, each averaged over the folds,
    * the Pearson correlation and its p-value between all held-out
      predictions and their true targets,
    * the mean of the per-fold test R^2 scores.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table, one row per sample (rows are selected with ``.iloc``).
    y : pandas.DataFrame or pandas.Series
        Target values aligned row-by-row with ``X``.
    k : int
        Number of cross-validation folds.
    c : int
        Number of leading coefficients to average and print.
    alpha : float, default 0.1
        Regularisation strength passed to ``ElasticNet``.
    l1_ratio : float, default 0.9
        L1/L2 mixing parameter passed to ``ElasticNet``.
    random_state : int or None, default None
        Seed forwarded to ``KFold``. ``None`` keeps the unseeded shuffling, so
        the printed numbers vary between runs; pass an int for repeatable
        folds.

    Raises
    ------
    IndexError
        If ``c`` is larger than the number of columns in ``X``.
    """
    from scipy.stats import pearsonr

    # Fail before any fitting rather than after all folds have been processed.
    n_features = X.shape[1]
    if c > n_features:
        raise IndexError(
            "c={} exceeds the number of feature columns ({})".format(c, n_features)
        )

    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    clf = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)

    coef = []
    r2_test = []
    all_pred = []
    all_y = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Fit the scaler on the training split only, so no test information
        # leaks into the standardisation.
        ss = StandardScaler()
        X_train = ss.fit_transform(X_train)
        X_test = ss.transform(X_test)

        clf.fit(X_train, y_train)
        coef.append(clf.coef_)

        pred_test = clf.predict(X_test)
        r2_test.append(r2_score(y_test, pred_test))
        # Flatten per fold; folds may differ in size, so they are concatenated
        # later instead of being stacked into one rectangular array.
        all_pred.append(np.ravel(pred_test))
        all_y.append(np.ravel(y_test.to_numpy()))

    # Average each of the first c coefficients across the folds.
    coef_mean = [mean([fold_coef[j] for fold_coef in coef]) for j in range(c)]
    print("回帰変数", coef_mean)

    a, b = pearsonr(np.concatenate(all_pred), np.concatenate(all_y))
    print("相関係数:", a)
    print("p値:", b)

    print('r2_test:',mean(r2_test)) # r2_test: coefficient of determination

#### Original results

In [15]:
# elastic_cv(X,y,5,2)

#### My results

In [20]:
# 5-fold cross-validation; c=4 so that all four feature coefficients are reported.
elastic_cv(X, y, k=5, c=4)

回帰変数 [0.5371886371310364, 0.676875800528906, 0.37616582864888165, -1.0527025543605808]
相関係数: 0.9184222039620783
p値: 2.9693847048956256e-41
r2_test: 0.8354719316861172
