In [1]:
import math
from scipy.io import arff
from scipy.stats.stats import pearsonr
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split

# Formatação mais bonita para os notebooks
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (15,5)


In [2]:
df_desharnais = pd.read_csv('02.desharnais.csv',  header=0)
df_desharnais.head()

Unnamed: 0,id,Project,TeamExp,ManagerExp,YearEnd,Length,Effort,Transactions,Entities,PointsNonAdjust,Adjustment,PointsAjust,Language
0,1,1,1,4,85,12,5152,253,52,305,34,302,1
1,2,2,0,0,86,4,5635,197,124,321,33,315,1
2,3,3,4,4,85,1,805,40,60,100,18,83,1
3,4,4,0,0,86,5,3829,200,119,319,30,303,1
4,5,5,0,0,86,4,2149,140,94,234,24,208,1


In [3]:
df_desharnais.shape

(81, 13)

In [15]:
df_desharnais.des

Unnamed: 0,id,Project,TeamExp,ManagerExp,YearEnd,Length,Effort,Transactions,Entities,PointsNonAdjust,Adjustment,PointsAjust,Language
count,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0
mean,41.0,41.0,2.185185,2.530864,85.740741,11.666667,5046.308642,182.123457,122.333333,304.45679,27.62963,289.234568,1.555556
std,23.526581,23.526581,1.415195,1.643825,1.222475,7.424621,4418.767228,144.035098,84.882124,180.210159,10.591795,185.761088,0.707107
min,1.0,1.0,-1.0,-1.0,82.0,1.0,546.0,9.0,7.0,73.0,5.0,62.0,1.0
25%,21.0,21.0,1.0,1.0,85.0,6.0,2352.0,88.0,57.0,176.0,20.0,152.0,1.0
50%,41.0,41.0,2.0,3.0,86.0,10.0,3647.0,140.0,99.0,266.0,28.0,255.0,1.0
75%,61.0,61.0,4.0,4.0,87.0,14.0,5922.0,224.0,169.0,384.0,35.0,351.0,2.0
max,81.0,81.0,4.0,7.0,88.0,39.0,23940.0,886.0,387.0,1127.0,52.0,1116.0,3.0


In [16]:
df_desharnais.corr()

Unnamed: 0,id,Project,TeamExp,ManagerExp,YearEnd,Length,Effort,Transactions,Entities,PointsNonAdjust,Adjustment,PointsAjust,Language
id,1.0,1.0,-0.006007,0.214294,0.096486,0.255187,0.126153,0.265891,0.028787,0.226076,-0.207774,0.202608,0.391475
Project,1.0,1.0,-0.006007,0.214294,0.096486,0.255187,0.126153,0.265891,0.028787,0.226076,-0.207774,0.202608,0.391475
TeamExp,-0.006007,-0.006007,1.0,0.424687,-0.210335,0.143948,0.119529,0.103768,0.256608,0.203805,0.235629,0.222884,-0.079112
ManagerExp,0.214294,0.214294,0.424687,1.0,-0.011519,0.211324,0.158303,0.138146,0.206644,0.207748,-0.066821,0.187399,0.205521
YearEnd,0.096486,0.096486,-0.210335,-0.011519,1.0,-0.095027,-0.048367,0.034331,0.001686,0.028234,-0.056743,0.012106,0.342233
Length,0.255187,0.255187,0.143948,0.211324,-0.095027,1.0,0.69328,0.620711,0.483504,0.723849,0.266086,0.714092,-0.02381
Effort,0.126153,0.126153,0.119529,0.158303,-0.048367,0.69328,1.0,0.581881,0.510328,0.705449,0.463865,0.738271,-0.261942
Transactions,0.265891,0.265891,0.103768,0.138146,0.034331,0.620711,0.581881,1.0,0.185041,0.886419,0.341906,0.880923,0.136778
Entities,0.028787,0.028787,0.256608,0.206644,0.001686,0.483504,0.510328,0.185041,1.0,0.618913,0.234747,0.598401,-0.056439
PointsNonAdjust,0.226076,0.226076,0.203805,0.207748,0.028234,0.723849,0.705449,0.886419,0.618913,1.0,0.383842,0.985945,0.082737


In [17]:
features = [ 'TeamExp', 'ManagerExp', 'YearEnd', 'Length', 'Transactions', 'Entities',
        'PointsNonAdjust', 'Adjustment', 'PointsAjust']

max_corr_features = ['Length', 'Transactions', 'Entities','PointsNonAdjust','PointsAjust']

X = df_desharnais[features]
y = df_desharnais['Effort']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=30)

neigh = KNeighborsRegressor(n_neighbors=3, weights='uniform')
neigh.fit(X_train, y_train) 
print(neigh.score(X_test, y_test))

0.7379861869550943


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=22)

model = LinearRegression()
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

0.7680074954440708


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=22)

parameters = {'kernel':('linear', 'rbf'), 'C':[1,2,3,4,5,6,7,8,9,10], 'gamma':('auto', 'scale')}

svr = SVR()
LinearSVC = GridSearchCV(svr, parameters, cv=3)
LinearSVC.fit(X_train, y_train)
print("Best params hash: {}".format(LinearSVC.best_params_))
print(LinearSVC.score(X_test, y_test))

Best params hash: {'C': 1, 'gamma': 'auto', 'kernel': 'linear'}
0.735919788126071


In [6]:
from sklearn.ensemble import RandomForestClassifier

In [8]:
n_feature=8
MMRE_list=[]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=30)

In [9]:

for i in np.arange(1,9):
    print(i)
    clf= RandomForestClassifier(n_estimators=500,max_features=i)
    clf.fit(X_train, y_train)
    Y_pred=clf.predict(X_test)
    diff=np.absolute(y_test-Y_pred)
    MRE=diff/y_test
    
    
    MMRE=np.mean(MRE)
    print(MMRE)
    MMRE_list.append(MMRE)
    

1
0.9343220122551504
2
0.7879875742034235
3
0.7924933823573136
4
0.9220488099758818
5
0.8687146152375227
6
0.8281882765207041
7
0.8226902208370419
8
0.8298961829415655


In [56]:
MMRE_list

[0.8808049689886723,
 0.8774290705479009,
 0.8716455092087839,
 0.9482877428402647,
 0.889239752293223]

In [50]:
for i in np.arange(1,8):
    print(i)

1
2
3
4
5
6
7


In [10]:
clf.feature_importances_

array([0.18778194, 0.20791802, 0.23522704, 0.1839756 , 0.18509741])

In [18]:
X_test

Unnamed: 0,Length,Transactions,Entities,PointsNonAdjust,PointsAjust
32,9,174,78,252,267
73,6,213,73,286,203
10,21,167,99,266,237
70,6,97,42,139,99
8,12,172,88,260,247
64,8,194,97,291,291
51,6,86,49,135,131
55,13,45,387,432,350
24,8,89,200,289,283
44,18,182,126,308,308


In [27]:
Y_pred

array([ 4172,  6405,  9051,  1876,  9051,  6405,  1876,  4494,  3164,
       14434,   847,  2821,  4277,  2583,  3829,  2422,  2583,  9135,
        2422,  7252,  9135,  6405,  9051,  1617,  9520, 13860,  2422],
      dtype=int64)

In [36]:
print(y_test)

32     2429
73      595
10     4067
70      546
8      7854
64     3626
51     3136
55     8232
24     3983
44     6699
34      651
4      2149
29     3948
63     1603
1      5635
68     2548
56     3276
39     8050
48     2331
21     5180
66    11361
58     3472
74     3941
57     2723
20    14973
72     9100
19      840
Name: Effort, dtype: int64


In [39]:
MME=m/y_test

In [41]:
MME=np.mean(MME)

In [42]:
MME

0.9354248799774019

In [58]:
clf= RandomForestClassifier(n_estimators=500,max_features=5)
clf.fit(X_train, y_train)
Y_pred=clf.predict(X_test)
diff=np.absolute(y_test-Y_pred)
MRE=diff/y_test
    

In [70]:
P=MRE[MRE<.25]

In [76]:
Pred=(P.size/MRE.size) * 100

In [77]:
Pred

29.629629629629626

In [79]:
MdMRE=np.median(MRE)

In [80]:
MdMRE

0.4017857142857143

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=30)

In [19]:
X_train

Unnamed: 0,TeamExp,ManagerExp,YearEnd,Length,Transactions,Entities,PointsNonAdjust,Adjustment,PointsAjust
43,-1,4,86,39,284,230,514,50,591
42,1,1,88,10,64,54,118,25,106
30,4,3,86,6,79,128,207,27,190
26,2,0,86,6,71,235,306,37,312
47,4,3,85,11,131,180,311,51,361
5,0,0,86,4,97,89,186,38,192
25,4,1,85,14,86,230,316,33,310
22,2,4,86,5,306,132,438,37,447
59,1,2,87,6,47,32,79,14,62
9,3,4,83,4,78,38,116,24,103


In [21]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
MMRE_list=[]
MdMRE_list=[]
Pred_list= []
for i in np.arange(1,10):   
    clf= RandomForestClassifier(n_estimators=100,max_features=i)
    clf.fit(X_train, y_train)
    Y_pred=clf.predict(X_test)
    diff=np.absolute(y_test-Y_pred)
    MRE=diff/y_test
    MMRE=np.mean(MRE)
    MdMRE=np.median(MRE)
    P=MRE[MRE<.25]
    Pred=(P.size/MRE.size) * 100
    MMRE_list.append(MMRE)
    MdMRE_list.append(MdMRE)
    Pred_list.append(Pred)

In [24]:
MMRE_list

[0.6474020978812322,
 0.6674598560270318,
 0.8330055982645084,
 0.8613630471647158,
 0.8182360918285473,
 0.9440200179484725,
 0.8613800198492472,
 0.8509151437250276,
 0.8327472052147136]

In [25]:
Pred_list

[33.33333333333333,
 22.22222222222222,
 22.22222222222222,
 22.22222222222222,
 22.22222222222222,
 22.22222222222222,
 14.814814814814813,
 22.22222222222222,
 18.51851851851852]

In [27]:
features = [ 'TeamExp', 'ManagerExp', 'YearEnd', 'Length', 'Transactions', 'Entities',
        'PointsNonAdjust', 'Adjustment', 'PointsAjust']

max_corr_features = ['Length', 'Transactions', 'Entities','PointsNonAdjust','PointsAjust']

X = df_desharnais[max_corr_features]
y = df_desharnais['Effort']

In [28]:
MMRE_list=[]
MdMRE_list=[]
Pred_list= []
for i in np.arange(1,10):   
    clf= RandomForestClassifier(n_estimators=100,max_features=i)
    clf.fit(X_train, y_train)
    Y_pred=clf.predict(X_test)
    diff=np.absolute(y_test-Y_pred)
    MRE=diff/y_test
    MMRE=np.mean(MRE)
    MdMRE=np.median(MRE)
    P=MRE[MRE<.25]
    Pred=(P.size/MRE.size) * 100
    MMRE_list.append(MMRE)
    MdMRE_list.append(MdMRE)
    Pred_list.append(Pred)

In [29]:
MMRE_list

[0.603497573190829,
 0.849789389763301,
 0.7096284121447558,
 0.6842445520935792,
 0.7629601614754336,
 0.9537515818352986,
 0.6612901766141701,
 0.8899580100050166,
 0.9011612343933936]

In [30]:
Pred_list

[25.925925925925924,
 14.814814814814813,
 18.51851851851852,
 29.629629629629626,
 29.629629629629626,
 25.925925925925924,
 14.814814814814813,
 18.51851851851852,
 18.51851851851852]

In [35]:
MMRE_list=[]
MdMRE_list=[]
Pred_list= []
for i in np.arange(1,6):   
    clf= RandomForestClassifier(n_estimators=100,max_features=i)
    clf.fit(X_train, y_train)
    Y_pred=clf.predict(X_test)
    diff=np.absolute(y_test-Y_pred)
    MRE=diff/y_test
    MMRE=np.mean(MRE)
    MdMRE=np.median(MRE)
    P=MRE[MRE<.25]
    Pred=(P.size/MRE.size) * 100
    MMRE_list.append(MMRE)
    MdMRE_list.append(MdMRE)
    Pred_list.append(Pred)
print(MMRE_list)
print(Pred_list)

[0.9303999267459266, 0.872243546415, 0.8387084322056961, 0.8195733507032912, 0.928392826639421]
[22.22222222222222, 22.22222222222222, 14.814814814814813, 22.22222222222222, 18.51851851851852]
