# Preparing data

#### General imports

In [None]:
import pandas as pd
import os
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from matplotlib import style
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor

#### Reading users data from multiple files

In [None]:
# Root directory that holds the recorded procedure logs.
path = r"data/procedure/"

# Collect every file below `path` whose name contains '.txt'.
# os.walk yields (root, directories, file names) for each visited directory.
files = [
    os.path.join(root, file_name)
    for root, _dirs, file_names in os.walk(path)
    for file_name in file_names
    if '.txt' in file_name
]

# Skip the '*_info.txt' companions and files with no content.
files = [
    candidate
    for candidate in files
    if not candidate.endswith('_info.txt') and os.stat(candidate).st_size > 0
]

# The logs are tab-separated and carry no header row, so the column names are
# supplied explicitly.
column_names = ["Id","Index","Consistence","Stimulant","SoundId","ImageId","Widget","Result","Time","Timestamp"]
datas = [pd.read_csv(log_file, sep='\t', names=column_names) for log_file in files]

# One frame with the rows of all logs; the per-file row labels are kept.
data = pd.concat(datas, sort=False)

#### Filtering data for data (inc) and data_con

In [None]:
def stringListToValues(result):
    """Convert a bracketed, comma-separated string into two floats.

    Leading and trailing '[' / ']' characters are removed, the remainder is
    split on commas and the first two pieces are converted with ``float``.
    Any further pieces are ignored.

    Args:
        result: Text such as ``"[0.5, -0.25]"``.

    Returns:
        A two-element list ``[first, second]`` of floats.

    Raises:
        ValueError: If one of the first two pieces is not a valid float.
        IndexError: If the text holds fewer than two comma-separated pieces.
    """
    pieces = result.strip('[]').split(',')
    first = float(pieces[0].strip())
    second = float(pieces[1].strip())
    return [first, second]

# def discretization(val):
#     if val < -0.33:
#         return 1
#     elif val < 0.33:
#         return 5
#     else:
#         return 9

# Emospace1
# Keep only the answers given with the 'emospace1' widget.  The explicit copy
# turns the filtered slice into an independent frame, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
data = data[data['Widget'] == 'emospace1'].copy()

# Getting valence and arousal to different columns.
# 'Result' holds text that stringListToValues turns into a two-float list;
# the two floats are then spread over the 'valence' and 'arousal' columns.
# Naming the columns up front keeps the assignment valid even when no row
# survived the widget filter.
data['Result'] = data['Result'].apply(stringListToValues)
data[['valence', 'arousal']] = pd.DataFrame(
    data['Result'].tolist(),
    index=data.index,
    columns=['valence', 'arousal'],
)

# Getting consistence data before scaling: data_con keeps the raw,
# unscaled valence/arousal values.
data_con = data.loc[
    data['Consistence'] == 'con',
    ['valence', 'arousal', 'SoundId', 'ImageId', 'Stimulant'],
]

# Scaling valence and arousal: value * 4 + 5 (0 maps to 5).
# NOTE(review): presumably the raw values lie in [-1, 1], which would map to
# the 1-9 range - confirm against the widget.
data['valence'] = data['valence'] * 4 + 5
data['arousal'] = data['arousal'] * 4 + 5

# Scaling with discretization valence and arousal
# data['valence'] = data['valence'].apply(lambda it : round((it * 4) + 5, 2));
# data['arousal'] = data['arousal'].apply(lambda it : round((it * 4) + 5, 2));

# discretization valence - does not seem to make much difference
# data['valence'] = data['valence'].apply(func = discretization)
# data['arousal'] = data['arousal'].apply(func = discretization)

# Rows with the 'p-s-' stimulant go to a separate frame, `data` is narrowed to
# the inconsistent ('inc') rows, and data_with_minus holds both sets.
data_minus = data[data['Stimulant'] == 'p-s-']
data = data[data['Consistence'] == 'inc']
data_with_minus = pd.concat([data_minus, data])

# Getting only data we want
wanted_columns = ['Id','Index','valence', 'arousal', 'SoundId', 'ImageId', 'Stimulant']
data = data[wanted_columns]
data_with_minus = data_with_minus[wanted_columns]

#### Joining data for image and sound

In [None]:
# Reading data frames for pictures and sounds (semicolon-separated files)
df_picture = pd.read_csv('IAPS.csv', sep=';')
df_sound = pd.read_csv('IADS2.csv', sep=';')

# For now, picture ids that contain a comma are not taken into consideration
df_picture = df_picture[df_picture['IAPS'].str.find(',') == -1]

# Changing type - required for merge
df_picture = df_picture.astype({'IAPS' : 'int64'})

# Merging.  Both joins are inner joins, so survey rows without a matching
# picture (including the comma ids removed above) or sound are dropped.
df_merge = pd.merge(data, df_picture, left_on='ImageId', right_on='IAPS', how='inner')
df_final = pd.merge(df_merge, df_sound, left_on='SoundId', right_on='Number', how='inner')

# Renaming: after the second merge the '_x' columns come from the picture
# table and the '_y' columns from the sound table.  Row labels become strings.
df_final = df_final.rename(index=str, columns={
    "ValenceMean_x":"ValMeanPic",
    "ValenceMean_y":"ValMeanSound",
    'ValenceSD_x':'ValSDPic',
    'ValenceSD_y':'ValSDSound',
    'ArousalMean_x':'ArMeanPic',
    'ArousalMean_y':'ArMeanSound',
    'ArousalSD_x':'ArSDPic',
    'ArousalSD_y':'ArSDSound'
})

# Typing: replace decimal commas by points in every string cell, then cast the
# numeric columns.  The element-wise replacement goes through Series.map
# column by column because DataFrame.applymap is deprecated since pandas 2.1.
df_final = df_final.apply(
    lambda column: column.map(lambda s: s.replace(',', '.') if isinstance(s, str) else s)
)
# The sound columns are cast together with the picture columns so that every
# rating used by the later cells is numeric.
df_final = df_final.astype({
    'ValSDPic' : 'float64',
    'ArSDPic' : 'float64',
    'arousal' : 'float64',
    'valence' : 'float64',
    'ValMeanPic' : 'float64',
    'ArMeanPic' : 'float64',
    'ValMeanSound' : 'float64',
    'ArMeanSound' : 'float64',
    'ValSDSound' : 'float64',
    'ArSDSound' : 'float64'
})

# Deleting NaN values if any exist (the original author noted none occurred)
df_final.dropna(inplace=True)

# Printing sample rows from final data frame
print("############################# df_final.head() ##############################\n")
print(df_final.head())
# Report the size of the final frame (previously len(df_merge) was printed here).
print("\ndf_finale.len = ",len(df_final))

# ML

#### Test regresji liniowej na dobrych danych

In [None]:
# Synthetic data set: one seed row followed by, for every i in 0..99, the rows
# (i, 2i, 5i) and (800i, 200i, 3i).
test_data = [[200, 100, 20]]
for i in range(100):
    test_data.extend([[i, 2 * i, 5 * i], [800 * i, 200 * i, 3 * i]])

df_test = pd.DataFrame(test_data, columns=['x', 'y', 'z'])

# 'x' and 'y' are the features, 'z' is the target.
X = df_test[['x', 'y']].to_numpy()
y = df_test['z'].to_numpy()

# Fixed seed, so the 80/20 split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

# fit() returns the fitted estimator itself.
clf = LinearRegression().fit(X_train, y_train)

# score() is the coefficient of determination (R^2) on the held-out rows.
accuracy = clf.score(X_test, y_test)
print("Linear regression accuracy:", accuracy)

#### Choosing and splitting data

In [None]:
# Remove every row whose (Id, Index) combination occurs more than once;
# keep=False discards all copies, not only the repeats.
df_final = df_final.drop_duplicates(subset=['Id', 'Index'], keep=False)

# Author's notes: fewer features gave better results; reducing the accuracy
# of the values worsened results - the only case that improved was the
# decision tree with an accuracy of 1/10.
feature_columns = ['ValMeanPic' , 'ValMeanSound', 'ArMeanPic', 'ArMeanSound']

# X holds one row of stimulus ratings per answer; y is the reported valence,
# kept as a single-column 2-D array.
X = df_final[feature_columns].to_numpy(copy=True)
y = df_final[['valence']].to_numpy(copy=True)

#### Model regresji liniowej

In [None]:
# Hold out 20 % of the rows for evaluation; no seed is given, so the split
# (and the score) varies between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Ordinary least squares; fit() returns the fitted estimator.
clf = LinearRegression().fit(X_train, y_train)

# score() is the coefficient of determination (R^2) on the held-out rows.
accuracy = clf.score(X_test, y_test)
print("Linear regression accuracy:", accuracy)

#### Regresja - drzewo decyzyjne

In [None]:
# Fresh unseeded 80/20 split for this model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Decision-tree regressor with default settings; fit() returns the estimator.
clf = tree.DecisionTreeRegressor().fit(X_train, y_train)

# score() is the coefficient of determination (R^2) on the held-out rows.
accuracy = clf.score(X_test, y_test)
print("Regression - decision tree accuracy:", accuracy)

#### RandomForestRegressor

In [None]:
# Fresh unseeded 80/20 split for this model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Forest of 20 trees with a fixed seed for the estimator itself.
clf = RandomForestRegressor(random_state=0, n_estimators=20)

# The target is flattened to 1-D before fitting.
clf.fit(X_train, np.ravel(y_train))

# score() is the coefficient of determination (R^2) on the held-out rows.
accuracy = clf.score(X_test, y_test)
print("RandomForestRegressor accuracy:", accuracy)

#### SVR

In [None]:
# Support vector regression with a linear kernel.
clf = SVR(kernel='linear')

# Fresh unseeded 80/20 split for this model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# SVR expects a 1-D target.  ravel() flattens y_train in case it is a column
# vector, which would otherwise raise a DataConversionWarning; this matches
# the RandomForestRegressor cell.
clf.fit(X_train, y_train.ravel())

# score() is the coefficient of determination (R^2) on the held-out rows.
accuracy = clf.score(X_test, y_test)

print("SVR accuracy:", accuracy)

#### Linear regression for different features

In [None]:
# Each entry is [stimulus rating column, reported rating column].
features = [['ArMeanPic', 'arousal'], ['ArMeanSound', 'arousal'], ['ValMeanPic', 'valence'], ['ValMeanSound', 'valence']]

for feature in features:
    X_local = np.array(df_final[feature[0]])
    y_local = np.array(df_final[feature[1]])

    # Single-feature fit; the estimator needs a 2-D feature matrix.
    lr = LinearRegression()
    lr.fit(X_local[:, np.newaxis], y_local)
    # Predict once and reuse the result for the segments and the fit line.
    y_fit = lr.predict(X_local[:, np.newaxis])

    # One vertical segment per sample, from the observed value to the fitted
    # value.  A segment needs two points to be drawn; previously each segment
    # held a single point placed at the row index, so nothing was rendered.
    segments = [
        [[x_value, observed], [x_value, fitted]]
        for x_value, observed, fitted in zip(X_local, y_local, y_fit)
    ]
    lc = LineCollection(segments, zorder=0)
    lc.set_array(np.ones(len(y_local)))
    lc.set_linewidths(np.full(len(X_local), 0.5))

    fig = plt.figure()
    plt.plot(X_local, y_local, 'r.', markersize=5)

    plt.plot(X_local, y_fit, 'b-')
    plt.gca().add_collection(lc)
    # The two legend labels refer to the two plt.plot calls above, in order.
    plt.legend(('Data', 'Linear Fit'), loc='lower right')
    plt.xlabel(feature[0])
    plt.ylabel(feature[1])
    plt.show()

# Statystyka

#### Checking if every image and sound are present in only one examined, inconsistent pair

In [None]:
# Pair = image + sound: one row per distinct (SoundId, ImageId) combination,
# taken from the answers sorted by sound id.
pair = data.sort_values(by=['SoundId']).drop_duplicates(subset=['SoundId', 'ImageId'], keep='first')

# One row per distinct sound and one row per distinct image.
soundids = data.drop_duplicates(subset='SoundId', keep='first')
imageids = data.drop_duplicates(subset='ImageId', keep='first')

# Count unique pairs and unique stimuli ("incentives").
print("Pair count = ", len(pair), "\nSounds count = ", len(soundids), "\nImages count = ", len(imageids))

# If every stimulus occurs in at least one pair and the number of pairs equals
# the number of stimuli, each stimulus occurs in exactly one pair - a pair can
# then be identified by the id of either of its stimuli.

#### Counting means and sds for data from survey

In [None]:
def plot_bar(names,values,title):
    """Show a bar chart with one bar per entry of ``names``.

    Args:
        names: Tick labels, one per bar.
        values: Bar heights, in the same order as ``names``.
        title: Text appended to 'Comparing ' to form the chart title.
    """
    # Three semi-transparent blue RGBA tones used for the bars.
    bar_colors = [(0.2, 0.3, 0.6, 0.5),(0.1, 0.4, 0.6, 0.6),(0.1, 0.5, 0.7, 0.8)]
    positions = np.arange(len(names))

    plt.bar(positions, values, color=bar_colors)
    plt.xticks(positions, names, rotation=25, fontsize=9)
    plt.xlabel('Kind', fontsize=11)
    plt.ylabel('Value', fontsize=11)
    plt.title('Comparing ' + title)
    plt.show()

# get DF with pairs -> image+sound comparing, sorted by sound id
# (sort_values already returns a new frame, so no separate copy is needed)
pairs = df_final.sort_values(by=['SoundId'])

####################### Mean #########################

# average valence for each pair, keyed by sound id
valenceAv = dict(pairs.groupby('SoundId')['valence'].mean())

# average arousal for each pair, keyed by sound id
arousalAv = dict(pairs.groupby('SoundId')['arousal'].mean())

# list of sound Ids for random selection
soundId_list = list(arousalAv.keys())
soundId = random.choice(soundId_list)

# example for random test pair: the stimulus ratings of the chosen sound's row
# next to the survey means of that pair
means_names = ['ValMeanPic' , 'ValMeanSound', 'ValMeanPair', 'ArMeanPic', 'ArMeanSound', 'ArMeanPair']
df_stat = df_final[['SoundId' , 'ImageId', 'ValMeanPic' , 'ValMeanSound', 'ArMeanPic', 'ArMeanSound']]
df_stat = df_stat.drop_duplicates(subset='SoundId',keep='first')
# Copy the selected row before adding columns; assigning onto the .loc slice
# directly raises pandas' SettingWithCopyWarning.
df_stat = df_stat.loc[df_stat['SoundId'] == soundId].copy()
df_stat['ArMeanPair'] = arousalAv[soundId]
df_stat['ValMeanPair'] = valenceAv[soundId]

df_stat = df_stat[means_names]
means_list = df_stat.values.tolist()[0]

plot_bar(means_names,means_list,'Means')

######################## SD #########################

# standard deviation of valence for each pair
valenceSd = pairs.groupby('SoundId')['valence'].std()

# standard deviation of arousal for each pair
arousalSd = pairs.groupby('SoundId')['arousal'].std()

# prepare SD data for plot purpose (same random sound as above)
sd_names = ['ValSDPic' , 'ValSDSound', 'ValSDPair', 'ArSDPic', 'ArSDSound', 'ArSDPair']
df_sd = df_final[['SoundId' , 'ImageId', 'ValSDPic' , 'ValSDSound', 'ArSDPic', 'ArSDSound']]
df_sd = df_sd.drop_duplicates(subset='SoundId',keep='first')
# Copy before adding columns, for the same reason as df_stat above.
df_sd = df_sd.loc[df_sd['SoundId'] == soundId].copy()
df_sd['ArSDPair'] = arousalSd[soundId]
df_sd['ValSDPair'] = valenceSd[soundId]
df_sd = df_sd[sd_names].copy()

sd_list = df_sd.values.tolist()[0]
plot_bar(sd_names,sd_list,'SD')


#### Checking the % of data where the negative image/sound had the bigger influence on the result

In [None]:
# Split the answers by stimulant code.
# NOTE(review): 'p+s-' presumably means positive picture / negative sound and
# 'p-s+' the opposite - confirm against the procedure description.
stimulant = data['Stimulant']
pic = data.loc[stimulant == 'p+s-']
sou = data.loc[stimulant == 'p-s+']

# Rows with a negative reported valence: below 5, the value a raw 0 is
# rescaled to.
pic_neg = pic.loc[pic['valence'] < 5]
sou_neg = sou.loc[sou['valence'] < 5]

# Share of negative answers within each group, rounded to two decimals.
negative_picture_share = round(len(sou_neg) / len(sou), 2)
print("Negative feelings/negative picture appeared:", negative_picture_share)
negative_sound_share = round(len(pic_neg) / len(pic), 2)
print("Negative feelings/negative sound appeared:", negative_sound_share)

#### Checking the % of data where at least one negative image/sound had the bigger influence on the result

In [None]:
# Answers from data_with_minus whose reported valence is below 5.
is_negative = data_with_minus['valence'] < 5
have_minus_neg = data_with_minus[is_negative]

# Share of those answers among all rows of data_with_minus, two decimals.
negative_share = round(len(have_minus_neg) / len(data_with_minus), 2)
print("Negative feelings/negative appeared:", negative_share)

#### Checking if the data is correct for consistent stimulants

In [None]:
# Replace decimal commas by points in every string cell, then make sure the
# ratings are floats.  Series.map is applied column by column because
# DataFrame.applymap is deprecated since pandas 2.1.
data_con = data_con.apply(
    lambda column: column.map(lambda s: s.replace(',', '.') if isinstance(s, str) else s)
)
data_con = data_con.astype({'arousal' : 'float64','valence' : 'float64'})

# Split the consistent answers by stimulant code.
negative = data_con[data_con['Stimulant'] == 'p-s-']
neutral = data_con[data_con['Stimulant'] == 'p0s0']
positive = data_con[data_con['Stimulant'] == 'p+s+']

# Epsilon = 0.15: half-width of the band around 0 that counts as neutral.
# data_con was taken before the rescaling, so valence is compared in its raw
# form here.
epsilon = 0.15
negative_con = negative[negative['valence'] < -epsilon]
neutral_con = neutral[(neutral['valence'] < epsilon) & (neutral['valence'] > -epsilon)]
positive_con = positive[positive['valence'] > epsilon]

# Fraction of each group whose reported valence matches the stimulant.
neg_perc = len(negative_con) / len(negative)
neu_perc = len(neutral_con) / len(neutral)
pos_perc = len(positive_con) / len(positive)

print("p-s-: ",neg_perc, "\np0s0: ", neu_perc, "\np+s+: ", pos_perc)