# Deforestation Data Cleaning

### Imports and Expirement Variables:

In [None]:
import pandas as pd
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import callbacks
from tensorflow.keras.metrics import AUC, Recall, Accuracy

In [32]:
#They are expirement variables that are meant to allow me to replicate the process with different models later
color_mode = 'rgb' #flow_from_dataframe
class_mode = 'binary' #flow_from_dataframe. For now, later on we will investigate multi-layer output
if color_mode == 'rgb': #control flow to build input layer of the model accordingly
    input_shape = (256,256,3)
elif color_mode == 'rgba':
    input_shape = (256,256,4)
image_train_path = 'raw_data/raw_data/train-jpg/train-jpg' #flow_from_dataframe
batch_size = 128  #expirement variable

### Data Preperation:

In [3]:
#Load and inspect data
df = pd.read_csv('raw_data/raw_data/train_v2.csv/train_v2.csv')
df

Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road
...,...,...
40474,train_40474,clear primary
40475,train_40475,cloudy
40476,train_40476,agriculture clear primary
40477,train_40477,agriculture clear primary road


In [4]:
def clean_row(row):
    '''
    labels of the data frame are a string that contains all landscapes that were detected by humans.
    At phase 1, we follow this logic: if there's just primary label (excluding weather and water)
    that's primary. If there's anything more then that- damaged.
    All cloudy photo are None due to lack of valubale information.
    '''

    label = 'damaged'
    kickout_labels = ['clear','cloudy', 'partly_cloudy', 'haze', 'blooming', 'water']
    tags_list = row.split()
    tags_list_relevant = [word for word in tags_list if word not in kickout_labels]
    
    if len(tags_list_relevant) == 0:
        label = None
        
    elif tags_list_relevant == ['primary']:
        label = 'primary'
    
    return label
        

In [5]:
#Clean the original tags column and create a clean df. multiple path to expirement with flow_from_df

df['label'] = df['tags'].map(clean_row)
df['image_filename'] = df['image_name'].map(lambda x: f'{x}.jpg')
df['image_path'] = df['image_name'].map(lambda x: f'{image_train_path}/{x}.jpg')
df

Unnamed: 0,image_name,tags,label,image_filename,image_path
0,train_0,haze primary,primary,train_0.jpg,raw_data/raw_data/train-jpg/train-jpg/train_0.jpg
1,train_1,agriculture clear primary water,damaged,train_1.jpg,raw_data/raw_data/train-jpg/train-jpg/train_1.jpg
2,train_2,clear primary,primary,train_2.jpg,raw_data/raw_data/train-jpg/train-jpg/train_2.jpg
3,train_3,clear primary,primary,train_3.jpg,raw_data/raw_data/train-jpg/train-jpg/train_3.jpg
4,train_4,agriculture clear habitation primary road,damaged,train_4.jpg,raw_data/raw_data/train-jpg/train-jpg/train_4.jpg
...,...,...,...,...,...
40474,train_40474,clear primary,primary,train_40474.jpg,raw_data/raw_data/train-jpg/train-jpg/train_40...
40475,train_40475,cloudy,,train_40475.jpg,raw_data/raw_data/train-jpg/train-jpg/train_40...
40476,train_40476,agriculture clear primary,damaged,train_40476.jpg,raw_data/raw_data/train-jpg/train-jpg/train_40...
40477,train_40477,agriculture clear primary road,damaged,train_40477.jpg,raw_data/raw_data/train-jpg/train-jpg/train_40...


In [6]:
df_clean = df[['image_filename', 'label', 'image_name', 'image_path']].dropna()
df_clean
#WE might need to change that later on and un-drop the clouds columns, and only drop or ignore them in training.

Unnamed: 0,image_filename,label,image_name,image_path
0,train_0.jpg,primary,train_0,raw_data/raw_data/train-jpg/train-jpg/train_0.jpg
1,train_1.jpg,damaged,train_1,raw_data/raw_data/train-jpg/train-jpg/train_1.jpg
2,train_2.jpg,primary,train_2,raw_data/raw_data/train-jpg/train-jpg/train_2.jpg
3,train_3.jpg,primary,train_3,raw_data/raw_data/train-jpg/train-jpg/train_3.jpg
4,train_4.jpg,damaged,train_4,raw_data/raw_data/train-jpg/train-jpg/train_4.jpg
...,...,...,...,...
40473,train_40473.jpg,damaged,train_40473,raw_data/raw_data/train-jpg/train-jpg/train_40...
40474,train_40474.jpg,primary,train_40474,raw_data/raw_data/train-jpg/train-jpg/train_40...
40476,train_40476.jpg,damaged,train_40476,raw_data/raw_data/train-jpg/train-jpg/train_40...
40477,train_40477.jpg,damaged,train_40477,raw_data/raw_data/train-jpg/train-jpg/train_40...


In [7]:
#save for later use
df_clean.to_csv('raw_data/train_clean_labels.csv', index = False)

In [8]:
#Check
df_clean = pd.read_csv('raw_data/train_clean_labels.csv', dtype = str)

### Build Baseline Model

In [None]:
metrics = [AUC(), Recall(), Accuracy()]

In [28]:
def build_baseline():
    '''
    This model will be used as a fast baseline, and as a quick solution to my colleagues who work
    on production
    '''
    
    resnet50 = ResNet50(include_top = False, input_shape = input_shape)
    resnet50.trainable = False
    baseline = Sequential()
    baseline.add(resnet50)
    baseline.add(Flatten())
    baseline.add(Dense(1, activation = 'sigmoid'))
    
    baseline.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=metrics)   #recall is the way to go 
    return baseline

In [29]:
MODEL = "baseline_model"

mcp = callbacks.ModelCheckpoint("ML/models/{}.h5".format(MODEL), monitor="val_loss", verbose=0, save_best_only=True)

lrr = callbacks.ReduceLROnPlateau(monitor="val_loss", factor = 0.1, patience=3, verbose=1, min_lr=0)

es = callbacks.EarlyStopping(monitor='val_loss', patience=15, verbose=0, restore_best_weights=True)

cs = [mcp, lrr, es]



In [30]:
baseline_model = build_baseline()

### Train Baseline Model

In [20]:
image_generator = ImageDataGenerator(validation_split = 0.2, rescale = 1./255)

In [33]:
train_generator = image_generator.flow_from_dataframe(
    dataframe = df_clean,
    directory = image_train_path,
    x_col = 'image_filename',
    y_col = 'label',
    color_mode = color_mode,
    class_mode = class_mode,
    batch_size = batch_size,
    subset = 'training',
    #validate_filenames = False
)

validation_generator = image_generator.flow_from_dataframe(
    dataframe = df_clean,
    directory = image_train_path,
    x_col = 'image_filename',
    y_col = 'label',
    color_mode = color_mode,
    class_moode = class_mode,
    batch_size = batch_size,
    subset = 'validation',
    #validate_filenames = False
)

Found 30479 validated image filenames belonging to 2 classes.
Found 7619 validated image filenames belonging to 2 classes.


In [34]:
baseline_history = baseline_model.fit(
        train_generator,
        epochs=300,
        validation_data=validation_generator,
        callbacks = cs)

Epoch 1/300


2022-12-15 14:33:28.007165: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 536870912 exceeds 10% of free system memory.
2022-12-15 14:33:28.745684: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 536870912 exceeds 10% of free system memory.
2022-12-15 14:33:29.097081: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 553779200 exceeds 10% of free system memory.
2022-12-15 14:33:30.736803: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 536870912 exceeds 10% of free system memory.
2022-12-15 14:33:31.356612: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 536870912 exceeds 10% of free system memory.


 48/239 [=====>........................] - ETA: 1:03:46 - loss: 2.1278 - accuracy: 0.5135

KeyboardInterrupt: 