In [25]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
# import os
# os.environ["PATH"] += os.pathsep + r'C:\Program Files (x86)\graphviz-2.38\release\bin'

In [26]:
# Read the cleaned survey export into a DataFrame and preview the first rows.
df = pd.read_csv(filepath_or_buffer='SMT202 Arts Students Survey_cleaned.csv')
df.head(n=5)

Unnamed: 0,School,Course,Experience,SMU_convenience,Visits_to_SMU,Area_of_visit,Purpose_visit,Arts_scene_Sentiments,Problems_Lack of medium,Problems_Lack of funds/grants,Problems_Lack of exposure,Problems_Lack of mentorship,enough_exposure_localartists,enough_exposure_successfulartists,Interest_featureartworks,Interest_viewartworks,Interest_collaborative
0,SOTA,Dance,> 5 years,3,1,Others,Eat,growing,1,1,1,0,3,0,0,3,1
1,SOTA,Dance,> 5 years,3,4,T-Junction - the area outside Bras Basah MRT a...,Dance,"I feel like it is growing, at least for the da...",0,1,1,1,2,2,1,2,1
2,SOTA,Dance,> 5 years,3,4,T-Junction - the area outside Bras Basah MRT a...,Eat,very small and unknown,0,1,0,0,1,1,0,2,1
3,SOTA,Dance,> 5 years,2,1,Common Walkways,Hang out with friends,It’s developing. The arts scene currently is s...,0,0,1,0,2,3,0,3,1
4,SOTA,Dance,> 5 years,3,3,Others,Eat,The arts scene is Singapore is very small comp...,1,0,1,0,0,0,1,3,1


In [27]:
# Score every free-text answer in 'Arts_scene_Sentiments' with VADER and
# append the resulting score columns (one per key of the returned dict) to df.
analyzer = SentimentIntensityAnalyzer()

# Missing answers are scored as empty strings rather than NaN.
df['Arts_scene_Sentiments'] = df['Arts_scene_Sentiments'].fillna('')
sentiment = df['Arts_scene_Sentiments'].apply(analyzer.polarity_scores)

# Expand the per-row score dicts into columns in one step, keeping df's index
# so the concat below aligns row for row.
sentiment_scores = pd.DataFrame(sentiment.tolist(), index=df.index)

# `axis` must be passed by keyword (positional use fails on pandas >= 2.0).
# Score columns left over from a previous run of this cell are dropped first,
# so re-running the cell does not produce duplicate columns.
df = pd.concat(
    [df.drop(columns=sentiment_scores.columns, errors='ignore'), sentiment_scores],
    axis=1,
)

In [28]:
# Build the feature matrix X (one-hot encoded categoricals plus the numeric
# columns) and the two binary targets y1 / y2.
cat_cols = ['Area_of_visit','Experience','School','Course','Purpose_visit']

# Columns that must not be used as predictors: the raw free text (already
# summarised by the sentiment scores) and the three interest outcomes.
non_feature_cols = ['Arts_scene_Sentiments','Interest_featureartworks','Interest_viewartworks','Interest_collaborative']

X = (
    pd.get_dummies(df, columns=cat_cols)
    .drop(non_feature_cols, axis=1)
    # 'Unnamed: 0' looks like the row index saved with the CSV (0, 1, 2, ...
    # in the preview); it carries no survey information and would otherwise
    # be fitted as a feature. errors='ignore' keeps this working if the CSV
    # is later loaded with index_col=0.
    .drop(columns=['Unnamed: 0'], errors='ignore')
)
y1 = df['Interest_featureartworks']
y2 = df['Interest_collaborative']

In [29]:
# Hold out 40% of the rows for testing, stratified on y1 so both splits keep
# the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y1,
    test_size=0.4,
    stratify=y1,
    random_state=0,
)

# Oversample the training split only; the test split is left untouched.
# See the imbalanced-learn docs for how a float sampling_strategy is read.
ros = RandomOverSampler(sampling_strategy=0.7, random_state=0)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [30]:
# L1-penalised logistic regression on the oversampled training data.
# The solver is named explicitly: the recorded run relied on the old default
# (liblinear), while scikit-learn >= 0.22 defaults to lbfgs, which rejects
# penalty='l1' with a ValueError.
clf = LogisticRegression(random_state=0, penalty='l1', solver='liblinear')
clf.fit(X_resampled, y_resampled)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [31]:
# Pair each feature name with its fitted coefficient and list them from the
# largest coefficient to the smallest.
coeff_list = sorted(
    ([name, weight] for name, weight in zip(X.columns, clf.coef_[0])),
    key=lambda pair: pair[1],
    reverse=True,
)
for var, coeff in coeff_list:
    print("Variable:", var, "Coeff:", coeff)

Variable: Problems_Lack of exposure Coeff: 1.601950389187411
Variable: Experience_3-4 years Coeff: 0.9807894113769333
Variable: Visits_to_SMU Coeff: 0.005326536897815594
Variable: SMU_convenience Coeff: 0.0
Variable: Problems_Lack of medium Coeff: 0.0
Variable: Problems_Lack of funds/grants Coeff: 0.0
Variable: Problems_Lack of mentorship Coeff: 0.0
Variable: enough_exposure_successfulartists Coeff: 0.0
Variable: neg Coeff: 0.0
Variable: neu Coeff: 0.0
Variable: pos Coeff: 0.0
Variable: compound Coeff: 0.0
Variable: Area_of_visit_Campus Green - the green patch of the grass in the middle of the school Coeff: 0.0
Variable: Area_of_visit_Common Walkways Coeff: 0.0
Variable: Area_of_visit_Common Walkways,No, I haven’t been to SMU Coeff: 0.0
Variable: Area_of_visit_Koufu Coeff: 0.0
Variable: Area_of_visit_Koufu,Common Walkways Coeff: 0.0
Variable: Area_of_visit_No, I haven’t been to SMU Coeff: 0.0
Variable: Area_of_visit_Others Coeff: 0.0
Variable: Area_of_visit_SMU Connexion - the new buil

In [32]:
from sklearn.metrics import accuracy_score, classification_report

# Score the fitted model on the held-out split and print per-class
# precision / recall / F1.
y_pred = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.33      0.33      0.33         3
           1       0.78      0.78      0.78         9

    accuracy                           0.67        12
   macro avg       0.56      0.56      0.56        12
weighted avg       0.67      0.67      0.67        12



In [33]:
# Apply the fitted model to every row of X (training and test rows alike) and
# report how many respondents are predicted in each class.
y_pred = clf.predict(X)
attendance, count = np.unique(y_pred, return_counts=True)

int_label = ['Not interested','Interested']
print("Total sample size:",len(y_pred))
# A separate loop variable keeps the `attendance` array intact; the original
# loop rebound it to a scalar.
for label, cnt in zip(attendance, count):
    print("Number of students "+int_label[label]+":",cnt)

# Share predicted as class 1. Taken from y_pred directly because count[1]
# fails (or counts the wrong class) when only one class is predicted.
interested_ratio = float(np.mean(y_pred == 1))
print("Ratio of sample interested:",round(interested_ratio,2))

Total sample size: 30
Number of students Not interested: 8
Number of students Interested: 22
Ratio of sample interested: 0.73


In [40]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Same pipeline as for y1, now for the collaboration target y2:
# stratified split -> oversample the training split -> L1 logistic regression
# -> ranked coefficients -> held-out classification report.
# NOTE(review): test_size=0.7 leaves only 30% of the rows for training;
# confirm this is intended (the y1 split above uses 0.4).
X_train, X_test, y_train, y_test = train_test_split(X, y2, test_size=0.7, random_state=0,stratify = y2)

# Oversample the training split only; the test split is left untouched.
ros = RandomOverSampler(random_state=1,sampling_strategy=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# The solver is named explicitly: scikit-learn >= 0.22 defaults to lbfgs,
# which rejects penalty='l1'; liblinear matches the default of the recorded run.
clf = LogisticRegression(random_state=0, penalty='l1', solver='liblinear')
clf.fit(X_resampled, y_resampled)

# List features from the largest fitted coefficient to the smallest.
coeff_list = [[var,coeff] for var,coeff in zip(X.columns,clf.coef_[0])]
coeff_list.sort(key = lambda x : x[1], reverse=True)
for var,coeff in coeff_list:
    print("Variable:",var, "Coeff:",coeff)

# Per-class precision / recall / F1 on the held-out split.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

Variable: SMU_convenience Coeff: 1.5782382198792098
Variable: Visits_to_SMU Coeff: 0.0
Variable: Problems_Lack of medium Coeff: 0.0
Variable: Problems_Lack of funds/grants Coeff: 0.0
Variable: Problems_Lack of exposure Coeff: 0.0
Variable: Problems_Lack of mentorship Coeff: 0.0
Variable: enough_exposure_successfulartists Coeff: 0.0
Variable: neg Coeff: 0.0
Variable: neu Coeff: 0.0
Variable: pos Coeff: 0.0
Variable: compound Coeff: 0.0
Variable: Area_of_visit_Campus Green - the green patch of the grass in the middle of the school Coeff: 0.0
Variable: Area_of_visit_Common Walkways Coeff: 0.0
Variable: Area_of_visit_Common Walkways,No, I haven’t been to SMU Coeff: 0.0
Variable: Area_of_visit_Koufu Coeff: 0.0
Variable: Area_of_visit_Koufu,Common Walkways Coeff: 0.0
Variable: Area_of_visit_No, I haven’t been to SMU Coeff: 0.0
Variable: Area_of_visit_Others Coeff: 0.0
Variable: Area_of_visit_SMU Connexion - the new building beside National Museum of Singapore,T-Junction - the area outside Br



In [23]:
#### Since y2 has a higher score (more students are interested in collaboration), we will choose collaboration.

In [35]:
# Apply the most recently fitted model to every row of X (training and test
# rows alike) and report how many respondents are predicted in each class.
y_pred = clf.predict(X)
attendance, count = np.unique(y_pred, return_counts=True)

int_label = ['Not interested','Interested']
print("Total sample size:",len(y_pred))
# A separate loop variable keeps the `attendance` array intact; the original
# loop rebound it to a scalar.
for label, cnt in zip(attendance, count):
    print("Number of students "+int_label[label]+":",cnt)

# Share predicted as class 1. Taken from y_pred directly because count[1]
# fails (or counts the wrong class) when only one class is predicted.
interested_ratio = float(np.mean(y_pred == 1))
print("Ratio of sample interested:",round(interested_ratio,2))

Total sample size: 30
Number of students Not interested: 2
Number of students Interested: 28
Ratio of sample interested: 0.93
