# Decision Tree Model

### Settings - Load libraries

In [103]:
import os
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation


### Settings - Directories and files

In [54]:
# Disabled: switch to the notebook folder first if the kernel was started elsewhere.
# os.chdir(".../notebook")
# Capture the kernel's current working directory; BASE_DIR is derived from it.
cwd = os.getcwd()
print(cwd)

/Users/dumoura/Dev/PDev/Mashup_Vid_Processing/notebook


In [55]:
# Working directories and locations used by the project.
# Every path hangs off BASE_DIR, the parent of the directory stored in `cwd`.
BASE_DIR = os.path.dirname(cwd)

# Top-level folders
DATA_DIR = os.path.join(BASE_DIR, "data")       # general data gathered during the project
META_DIR = os.path.join(BASE_DIR, "metadados")  # metadata gathered during the project

# Work-in-progress material; may be deleted at the end if desired
SAMPLE_DIR = os.path.join(DATA_DIR, "sample")
INPUTS_DIR = os.path.join(SAMPLE_DIR, "inputs")    # working area for media inputs
OUTPUTS_DIR = os.path.join(SAMPLE_DIR, "outputs")  # working area for media outputs

# Mashups
VIDS_DIR = os.path.join(DATA_DIR, "vids")

# Inputs
VID_DIR = os.path.join(INPUTS_DIR, "vid_input")
AUDIO_DIR = os.path.join(INPUTS_DIR, "audio_input")
LGG_DIR = os.path.join(INPUTS_DIR, "lgg_input")

# Outputs
FRAME_DIR = os.path.join(OUTPUTS_DIR, "thumbnails")
SONG_DIR = os.path.join(OUTPUTS_DIR, "songs")
LYRIC_DIR = os.path.join(OUTPUTS_DIR, "lyrics")

# Create every folder. exist_ok=True means an already existing folder is not an error.
for directory in (
    DATA_DIR, META_DIR, SAMPLE_DIR, INPUTS_DIR, OUTPUTS_DIR,  # base folders
    VIDS_DIR,                                                 # mashup videos
    VID_DIR, AUDIO_DIR, LGG_DIR,                              # inputs
    FRAME_DIR, SONG_DIR, LYRIC_DIR,                           # outputs
):
    os.makedirs(directory, exist_ok=True)

### Loading the data used to train the decision tree model

In [125]:
# Make META_DIR the process-wide working directory and list its contents.
# Relative paths used after this point resolve against META_DIR.
os.chdir(META_DIR)
os.listdir()

['.Rhistory',
 '.DS_Store',
 'MlKeywordsClassified.csv',
 'Vid24_WhiteStripesNirvana_ManualTagged.csv',
 'Vid24_WhiteStripesNirvana_ML_Pred.csv']

In [105]:
# Column names for the manually tagged training table.
# 'Keywords' is the label (dependent variable); the other columns describe each image.
#
# Alternative layout that also carries a leading 'file' column:
# col_names = ['file', 'imageID', 'Keywords', 'brightness_median', 'brightness_stdev','saturation_median',
#              'saturation_stdev', 'hue_median', 'hue_stdev',
#              'Count', 'Total Area', 'Average Size', '%Area']

col_names = ['imageID', 'Keywords', 'brightness_median', 'brightness_stdev',
       'saturation_median', 'saturation_stdev', 'hue_median', 'hue_stdev',
       'Count', 'Total Area', 'Average Size', '%Area']

# Read through an explicit META_DIR path so the cell does not depend on the
# kernel's current working directory. header=0 combined with names= replaces
# the header row stored in the file with col_names.
df = pd.read_csv(
    os.path.join(META_DIR, 'Vid24_WhiteStripesNirvana_ManualTagged.csv'),
    header=0,
    names=col_names,
)


In [None]:
# Disabled step: drop the rows that contain missing values.
# df = df.dropna()

In [106]:
# Show the column labels of the loaded training table.
df.columns

Index(['imageID', 'Keywords', 'brightness_median', 'brightness_stdev',
       'saturation_median', 'saturation_stdev', 'hue_median', 'hue_stdev',
       'Count', 'Total Area', 'Average Size', '%Area'],
      dtype='object')

### Splitting Data

In [107]:
# Separate the table into the independent variables (the features) and the
# dependent variable (the target to predict).
# NOTE(review): 'imageID' looks like a row identifier rather than a measurement —
# confirm it is meant to be used as a predictor.
feature_cols = [
    'imageID',
    'brightness_median', 'brightness_stdev',
    'saturation_median', 'saturation_stdev',
    'hue_median', 'hue_stdev',
    'Count', 'Total Area', 'Average Size', '%Area',
]

X = df[feature_cols]  # features (independent variables)
y = df['Keywords']    # target (dependent variable)



In [108]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# random_state is fixed so the same rows land in each part on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

### Decision Tree Model

In [111]:
# Build the decision tree classifier.
# 'gini' is scikit-learn's default split criterion ('entropy' is the usual
# alternative); max_depth can also be set to limit the size of the tree.
# random_state is fixed so that repeated runs grow the same tree and report
# the same accuracy.
clf = DecisionTreeClassifier(criterion='gini', random_state=1)

# Fit on the training split (fit() trains the estimator in place).
clf.fit(X_train, y_train)

# Predict the labels of the held-out test split.
y_pred = clf.predict(X_test)

### Evaluating - How often is the classifier correct?

In [112]:
# Accuracy: fraction of test rows whose predicted label equals the true label.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.925


## Tagging

In [124]:
# List the files in the current working directory.
os.listdir()

['.Rhistory',
 '.DS_Store',
 'MlKeywordsClassified.csv',
 'Vid24_WhiteStripesNirvana_ManualTagged.csv',
 'Vid24_WhiteStripesNirvana_ML_Pred.csv']

In [114]:
# Column names for the table of images to be tagged: a leading 'file' column
# followed by 'imageID' and the image measurements; there is no 'Keywords' column.
col_names = [ 'file', 'imageID', 'brightness_median', 'brightness_stdev',
       'saturation_median', 'saturation_stdev', 'hue_median', 'hue_stdev',
       'Count', 'Total Area', 'Average Size', '%Area']

# Read through an explicit META_DIR path so the cell does not depend on the
# kernel's current working directory. header=0 combined with names= replaces
# the header row stored in the file with col_names.
k_pred = pd.read_csv(
    os.path.join(META_DIR, 'Vid24_WhiteStripesNirvana_ML_Pred.csv'),
    header=0,
    names=col_names,
)

In [115]:
# Discard every row that has at least one missing value.
k_pred = k_pred.dropna(axis=0, how='any')

In [119]:
# Remove the 'file' column, which is not one of the model's feature columns.
# errors='ignore' keeps the cell re-runnable: once 'file' is gone, running the
# cell again is a no-op instead of raising KeyError.
k_pred = k_pred.drop(columns=['file'], errors='ignore')
k_pred.head()

Unnamed: 0,imageID,saturation_median,saturation_stdev,hue_median,hue_stdev,Count,Total Area,Average Size,%Area
0,1,85,49.0545,106,29.5308,9,163049,18116.556,88
1,2,0,18.3077,0,13.7129,0,0,0.0,0
2,3,0,18.3077,0,13.7129,0,0,0.0,0
3,4,0,18.3077,0,13.7129,0,0,0.0,0
4,5,255,50.7772,106,22.2292,4,690,172.5,0


In [120]:
# Fit a decision tree on the training split and use it to tag the new rows.
# random_state is fixed so repeated runs produce the same tree and the same tags.
clf = DecisionTreeClassifier(random_state=1)
clf.fit(X_train, y_train)

# Select the columns by name so the model receives exactly the features it was
# trained on, in the same order, whatever other columns k_pred carries.
# NOTE: this reassigns y_pred, which previously held the test-split predictions.
y_pred = clf.predict(k_pred[feature_cols])

In [121]:
# Wrap the predicted labels in a one-column DataFrame named "Keywords".
# NOTE(review): the frame gets a fresh 0..n-1 index and carries no imageID, so
# rows are identified only by position in the prediction input — confirm that
# is what consumers of the output expect.
k = pd.DataFrame(list(y_pred), columns=["Keywords"])

In [122]:
# Print the predicted keywords as plain text.
print(k)

                         Keywords
0       B_Instrumental_main-track
1     G_Lyrics_complementarytrack
2     G_Lyrics_complementarytrack
3     G_Lyrics_complementarytrack
4       B_Instrumental_main-track
...                           ...
2625  G_Lyrics_complementarytrack
2626  G_Lyrics_complementarytrack
2627  G_Lyrics_complementarytrack
2628  G_Lyrics_complementarytrack
2629  G_Lyrics_complementarytrack

[2630 rows x 1 columns]


In [123]:
# Save the predicted keywords in the metadata folder. The explicit path means
# the file lands in META_DIR whatever the current working directory is.
k.to_csv(os.path.join(META_DIR, "MlKeywordsClassified2.csv"))