# Decision Tree Model

### Settings - Load libraries

In [1]:
import os
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation


### Settings - Directories and files

In [57]:
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
# The resulting working directory is stored in `cwd` and printed.
NOTEBOOK_DIR = "/Users/dumoura/Dev/PDev/Mashup_Vid_Processing/notebook"
os.chdir(NOTEBOOK_DIR)
cwd = os.getcwd()
print(cwd)

/Users/dumoura/Dev/PDev/Mashup_Vid_Processing/notebook


In [58]:
# Working directories and locations

# Project root: the parent of the current working directory
BASE_DIR = os.path.dirname(cwd)

# Top-level project folders
DATA_DIR = os.path.join(BASE_DIR, "data")  # general data gathered during the project
META_DIR = os.path.join(BASE_DIR, "metadados")  # metadata gathered during the project

# Work-in-progress material (may be deleted at the end, if deemed necessary)
SAMPLE_DIR = os.path.join(DATA_DIR, "sample")
INPUTS_DIR = os.path.join(SAMPLE_DIR, "inputs")  # working area for media data processing
OUTPUTS_DIR = os.path.join(SAMPLE_DIR, "outputs")  # working area for media data processing

# Mashups
VIDS_DIR = os.path.join(DATA_DIR, "vids")

# Input folders, all located under INPUTS_DIR
VID_DIR, AUDIO_DIR, LGG_DIR = (
    os.path.join(INPUTS_DIR, folder)
    for folder in ("vid_input", "audio_input", "lgg_input")
)

# Output folders, all located under OUTPUTS_DIR
FRAME_DIR, SONG_DIR, LYRIC_DIR = (
    os.path.join(OUTPUTS_DIR, folder)
    for folder in ("thumbnails", "songs", "lyrics")
)

# Create every directory; directories that already exist are left as they are
for directory in (
    DATA_DIR, META_DIR, SAMPLE_DIR, INPUTS_DIR, OUTPUTS_DIR,  # base folders
    VIDS_DIR,                                                 # mashups
    VID_DIR, AUDIO_DIR, LGG_DIR,                              # inputs
    FRAME_DIR, SONG_DIR, LYRIC_DIR,                           # outputs
):
    os.makedirs(directory, exist_ok=True)

### Loading the dataset used to train and fit the decision tree model

In [59]:
# Switch the working directory to the metadata folder, so the CSV files read
# later in this notebook can be referenced by bare file name, then list its contents.
os.chdir(META_DIR)
os.listdir()

['.Rhistory',
 '26_CarpentersOasis.csv',
 '.DS_Store',
 'MlKeywordsClassified.csv',
 '26_CarpentersOasis_ML_Pred.csv',
 'Vid24_WhiteStripesNirvana_ML_Tgd.csv',
 '26_CarpentersOasis_03.csv',
 '26_CarpentersOasis_02.csv',
 'Vid24_WhiteStripesNirvana_ManualTagged.csv',
 'Vid24_WhiteStripesNirvana_ML_Pred.csv']

In [62]:
# Column labels for the training table: the 'Keywords' label plus the
# image-level measurements.
col_names = [
    'imageID', 'Keywords',
    'brightness_median', 'brightness_stdev',
    'saturation_median', 'saturation_stdev',
    'hue_median', 'hue_stdev',
    'Count', 'Total Area', 'Average Size', '%Area',
]

# header=0 combined with names= replaces the header row found in the file
# with the labels listed above.
df = pd.read_csv('26_CarpentersOasis_02.csv', names=col_names, header=0)


In [63]:
# Discard every row that has at least one missing value.
df = df.dropna()

In [64]:
# Show the column labels of the cleaned frame.
df.columns

Index(['imageID', 'Keywords', 'brightness_median', 'brightness_stdev',
       'saturation_median', 'saturation_stdev', 'hue_median', 'hue_stdev',
       'Count', 'Total Area', 'Average Size', '%Area'],
      dtype='object')

### Splitting Data

In [65]:
# Separate the table into the independent (feature) variables and the
# dependent (target) variable.
# NOTE(review): 'imageID' is included as a feature — confirm that is intended.
feature_cols = [
    'imageID',
    'brightness_median', 'brightness_stdev',
    'saturation_median', 'saturation_stdev',
    'hue_median', 'hue_stdev',
    'Count', 'Total Area', 'Average Size', '%Area',
]

X = df[feature_cols]  # independent: feature variables
y = df['Keywords']  # dependent: target variable



In [66]:
# Split the dataset into a training set (70%) and a test set (30%);
# random_state=1 fixes the shuffle so the split is repeatable.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) 

### Decision Tree Model

In [71]:
# Create the decision tree classifier.
# The split criterion is 'gini' (scikit-learn's default); 'entropy' is the
# alternative, and max_depth may be set as well.
# random_state is fixed so that re-running the notebook builds the same tree
# and therefore reports the same accuracy.
clf = DecisionTreeClassifier(criterion='gini', random_state=1)

# Train the classifier on the training split
clf = clf.fit(X_train, y_train)

# Predict the labels of the held-out test split
y_pred = clf.predict(X_test)

### Evaluating - How often is the classifier correct?

In [72]:
# Model accuracy: share of test samples whose predicted label equals the true label
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8387096774193549


## Tagging

In [73]:
# List the files in the current working directory.
os.listdir()

['.Rhistory',
 '26_CarpentersOasis.csv',
 '.DS_Store',
 'MlKeywordsClassified.csv',
 '26_CarpentersOasis_ML_Pred.csv',
 'Vid24_WhiteStripesNirvana_ML_Tgd.csv',
 '26_CarpentersOasis_03.csv',
 '26_CarpentersOasis_02.csv',
 'Vid24_WhiteStripesNirvana_ManualTagged.csv',
 'Vid24_WhiteStripesNirvana_ML_Pred.csv']

In [77]:
# Column labels for the table to be tagged: the same measurements used for
# training, without the 'Keywords' label.
# NOTE(review): an earlier version of this cell declared an extra leading
# 'file' column — confirm the layout of the CSV being read.
col_names = [
    'imageID',
    'brightness_median', 'brightness_stdev',
    'saturation_median', 'saturation_stdev',
    'hue_median', 'hue_stdev',
    'Count', 'Total Area', 'Average Size', '%Area',
]

# header=0 combined with names= replaces the header row found in the file
# with the labels listed above.
k_pred = pd.read_csv('26_CarpentersOasis_ML_Pred.csv', names=col_names, header=0)

In [78]:
# NOTE: this bare expression is not the last statement of the cell, so its value is not displayed.
k_pred.columns
# Discard every row that has at least one missing value.
k_pred = k_pred.dropna()

In [79]:
# k_pred = k_pred.drop(columns=['file'])
# Preview the first rows of the table to be tagged.
k_pred.head()

Unnamed: 0,imageID,brightness_median,brightness_stdev,saturation_median,saturation_stdev,hue_median,hue_stdev,Count,Total Area,Average Size,%Area
0,30,0,22.6181,0,48.9935,0,39.6775,4,15367,3841.75,12.555
1,33,0,80.7081,0,34.616,0,46.9229,8,16953,2119.125,13.85
2,36,0,80.6511,0,34.1051,0,46.4859,6,16914,2819.0,13.819
3,39,0,80.6809,0,33.3011,0,46.6411,6,16938,2823.0,13.838
4,42,0,80.707,0,32.9358,0,46.6317,8,16944,2118.0,13.843


In [80]:
# Tag the unlabeled rows with the classifier that was already trained and
# evaluated on the train/test split, so the accuracy reported for `clf`
# describes the very model that produces these tags (fitting a fresh,
# unseeded tree here would yield a different, unevaluated model).
# The feature columns are selected explicitly so they match, in name and
# order, the columns the classifier was fitted on.
# NOTE: y_pred is rebound here and no longer holds the test-set predictions.
y_pred = clf.predict(k_pred[feature_cols])

In [81]:
# Collect the predicted labels into a single-column table named 'Keywords'.
k = pd.DataFrame(list(y_pred), columns=["Keywords"])

In [82]:
# Print the predicted labels (pandas abbreviates long frames in the printed text).
print(k)

                       Keywords
0     B_Instrumental_main-track
1     B_Instrumental_main-track
2     B_Instrumental_main-track
3     B_Instrumental_main-track
4     B_Instrumental_main-track
...                         ...
2045   Y_Special effects_titles
2046   Y_Special effects_titles
2047   Y_Special effects_titles
2048   Y_Special effects_titles
2049   Y_Special effects_titles

[2050 rows x 1 columns]


In [83]:
# Write the predicted labels to a CSV file; the relative path is resolved
# against the current working directory, and the row index is written as well.
k.to_csv("MlKeywordsClassifiedOasys.csv")