-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
9c2d0be
commit 229e885
Showing
16 changed files
with
5,394 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
## SQUAAD ANALYSIS FRAMEWORK | ||
|
||
## Dependencies | ||
|
||
Install R on Ubuntu: | ||
`sudo apt-get install r-base` | ||
|
||
Install the SQUAAD library | ||
`pip install dist/squaad-1.0.tar.gz` | ||
|
||
Examples in examples folder. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
from squaad import ml | ||
import pandas as pd | ||
import numpy as np | ||
from squaad import db | ||
|
||
""" | ||
For the config file and format please look the repository for an example | ||
{"pgsql":{"host":"","user":"","passwd":"","db":""} } | ||
Support only for postgresql. Database must have the tables and fields of | ||
SQUAAD Database | ||
@config_file path to config file that contains the connection information | ||
@cache_folder path to the cache folder where to save the results | ||
""" | ||
myConnection=db("config.json","cache") | ||
|
||
dataset=myConnection.getQualityCompilation() | ||
df = pd.DataFrame(dataset) | ||
df = df.drop(df[df['impactful'] < 2].index) | ||
#create label | ||
df['affiliation']= np.where(((df["domain"] == 'google.com') & (df["organization"]=='google')) | | ||
((df["domain"] == 'apache.org') & (df["organization"]=='apache')) | | ||
((df["domain"] == 'netflix.com') & (df["organization"]=='netflix')) | ||
,df["organization"]+"-affiliated",df["organization"] +"-unaffiliated" ) | ||
|
||
metrics=['loc', 'cpx', 'sml', 'vul', 'fbg', 'locs_inc', 'cplxs_inc', 'smls_inc', 'vuls_inc', 'fbgs_inc', 'locs_dec', 'cplxs_dec', 'smls_dec', 'vuls_dec', 'fbgs_dec'] | ||
|
||
|
||
for metric in metrics: | ||
df[metric]=df[metric]/df['total'] | ||
df['analyzed']=df['analyzed']/df['impactful'] | ||
|
||
organizations=["google","apache","netflix"] | ||
scores_aff={} | ||
|
||
mlPipeline=ml(output=True, outputFolder="cache") | ||
|
||
for organization in organizations: | ||
df_aff=df[(df['affiliation'] == organization+'-affiliated') | (df['affiliation'] == organization+'-unaffiliated' ) ] | ||
X=df_aff[['locs_inc', 'cplxs_inc', 'smls_inc', 'vuls_inc', 'fbgs_inc', 'locs_dec', 'cplxs_dec', 'smls_dec', 'vuls_dec', 'fbgs_dec']] | ||
Y=df_aff['affiliation'] | ||
|
||
""" | ||
The goal of this function is to make it easy for someone with no experience in ML, to run a pipeline, | ||
based on the SQUAAD data. This is a very limited function, please consider understanding your data better | ||
and running and running the correct analysis, pre-processing. | ||
X = Pandas dataframe with set of data. | ||
Y = Labels for the set of data. | ||
split_columns unimplemented, columns to split by. That is columns that can have bias, we take into consideration during splitting | ||
kfolds=10, number of folds to run. | ||
classifiers={"Nearest Neighbors":KNeighborsClassifier(3), | ||
"Linear SVM": SVC(kernel="linear", C=0.025), | ||
"RBF SVM": SVC(gamma=2, C=1), | ||
"Gaussian Process": GaussianProcessClassifier(1.0 * RBF(1.0)), | ||
"Decision Tree" : DecisionTreeClassifier(max_depth=5), | ||
"Random Forest": RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), | ||
"Neural Net": MLPClassifier(alpha=1), | ||
"AdaBoost": AdaBoostClassifier(), | ||
"Naive Bayes": GaussianNB(), | ||
"QDA":QuadraticDiscriminantAnalysis() | ||
}, | ||
the classifiers you want to run, at least one must be present | ||
balancers={ | ||
"Unbalanced":None, | ||
"SMOTE":SMOTE(), | ||
"SMOTEEN":SMOTEENN(), | ||
"SMOTETomek":SMOTETomek(), | ||
"RandomUnderSampler":RandomUnderSampler() | ||
} | ||
the balancers you want to run, at least one must be present. If not want to run any balancing, use only "Unbalanced":None, | ||
example: | ||
classificationPipeLineKfold(self, | ||
X,Y, split_columns=None, kfolds=10, | ||
classifiers={"Nearest Neighbors":KNeighborsClassifier(3), | ||
"Linear SVM": SVC(kernel="linear", C=0.025), | ||
"RBF SVM": SVC(gamma=2, C=1), | ||
"Gaussian Process": GaussianProcessClassifier(1.0 * RBF(1.0)), | ||
"Decision Tree" : DecisionTreeClassifier(max_depth=5), | ||
"Random Forest": RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), | ||
"Neural Net": MLPClassifier(alpha=1), | ||
"AdaBoost": AdaBoostClassifier(), | ||
"Naive Bayes": GaussianNB(), | ||
"QDA":QuadraticDiscriminantAnalysis() | ||
}, | ||
balancers={ | ||
"Unbalanced":None, | ||
"SMOTE":SMOTE(), | ||
"SMOTEEN":SMOTEENN(), | ||
"SMOTETomek":SMOTETomek(), | ||
"RandomUnderSampler":RandomUnderSampler() | ||
} | ||
) | ||
""" | ||
mlPipeline.classificationPipeLineKfold(X,Y) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
from squaad import stats | ||
from squaad import file | ||
from squaad import db | ||
|
||
""" | ||
For the config file and format please look the repository for an example | ||
{"pgsql":{"host":"","user":"","passwd":"","db":""} } | ||
Support only for postgresql. Database must have the tables and fields of | ||
SQUAAD Database | ||
@config_file path to config file that contains the connection information | ||
@cache_folder path to the cache folder where to save the results | ||
""" | ||
myConnection=db("config.json","cache") | ||
|
||
stats=stats() | ||
|
||
affiliation=myConnection.getAffiliationCompilation() | ||
results={} | ||
|
||
groups={} | ||
for entry in affiliation: | ||
groups[entry[0]+"_"+entry[1]]={} | ||
groups[entry[0]+"_"+entry[1]][True]=int(entry[4]) | ||
groups[entry[0]+"_"+entry[1]][False]=int(entry[5])-int(entry[4]) | ||
|
||
|
||
results["comp"]=stats.gamesHowellBinomial(groups) | ||
|
||
affiliation=myConnection.getAffiliationQuality() | ||
#print(affiliation) | ||
metrics= {'locs', 'smls', 'cpxs', 'vuls', 'fbgs', 'locs_inc', 'smls_inc', 'cpxs_inc', 'vuls_inc', 'fbgs_inc', 'locs_dec', 'smls_dec', 'cpxs_dec', 'vuls_dec', 'fbgs_dec'} | ||
#CONVERT RESULTS TO ARRAY FOR PASSING TO STATS CHECK | ||
for metric in metrics: | ||
|
||
groups={} | ||
for entry in affiliation: | ||
groups[entry["organization"]+"_"+entry["status"]]={} | ||
groups[entry["organization"]+"_"+entry["status"]][True]=int(entry[metric]) | ||
groups[entry["organization"]+"_"+entry["status"]][False]=int(entry["total"])-int(entry[metric]) | ||
|
||
results[metric]=stats.gamesHowellBinomial(groups) | ||
|
||
file.saveResultsExcel(results,"stat.xls") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
from setuptools import setup | ||
|
||
setup(name='squaad', | ||
version='1.0', | ||
description='Helper functions for running queries, ml pipeline, statistical analysis on SQUAAD dataset', | ||
url='http://github.com/fostiropoulos', | ||
author='Iordanis Fostiropoulos', | ||
author_email='fostirop@usc.edu', | ||
license='MIT', | ||
packages=['squaad'], | ||
install_requires=[ | ||
'sklearn', | ||
'numpy', | ||
'seaborn', | ||
'matplotlib', | ||
'imblearn', | ||
'xlwt', | ||
'rpy2' | ||
], | ||
zip_safe=False) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
___all___=["db","stats","file","ml"] | ||
from .db import db | ||
from .stats import stats | ||
from .file import file | ||
from .ml import ml |
Oops, something went wrong.