<a href="https://colab.research.google.com/github/dajley/Analyzing-Outliers-Research-Project/blob/main/Adding_Outliers_to_Main_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
#Adding Outliers to Main Dataset
#Code and Research by Dajanique Leysath

#Import Libraries

In [19]:
import pandas as pd

In [20]:
import numpy as np

#Dataset

In [21]:
#Stores CSV file into dataframe
# NOTE(review): the path points into /content/drive, which presumably requires
# Google Drive to be mounted in Colab first; no mount step is visible in this
# notebook, so confirm it before a fresh "Run all".
df = pd.read_csv('/content/drive/MyDrive/Datasets/df_main.csv')

In [22]:
# Preview the first rows (head() defaults to 5) to check the CSV loaded as expected
df.head()

Unnamed: 0,Value,Avg Max Temp,Avg Min Temp,Avg Temp,Avg Bare Soil Temp,Avg Turf Soil Temp,Avg Wind Speed,Avg Max Wind Speed,Total Solar Rad,Avg Penman PET,Total Penman PET,Total Rainfall,Avg Dew Point,Avg Wind Chill
0,104.9,60.9285,37.1985,49.0635,50.249,50.8615,7.858,23.18,489.158,0.1835,5.617,1.381,34.8855,46.0855
1,59.7,53.1395,31.8545,42.497,47.227,46.505,9.866,28.0395,447.4045,0.191,5.849,2.2715,30.7485,37.6335
2,43.0,60.167,33.72,46.9435,52.5145,51.8025,7.575,23.952,442.318,0.2075,6.338,2.186,28.4045,44.637
3,76.8,58.649,32.194,45.422,52.1705,48.8195,8.3395,25.6365,488.8215,0.2055,6.2715,0.74,30.5045,42.399
4,94.9,57.266,34.1885,45.7275,49.0,47.4995,7.519,23.1775,396.1995,0.168,5.1335,2.2735,32.663,42.445


In [23]:
# (number of rows, number of columns) of the loaded dataset
df.shape

(286, 14)

In [24]:
# Label of the first column; the outlier step below slices df.columns[1:14]
# and therefore leaves this column untouched
df.columns[0]

'Value'

In [25]:
# Every column after the first: the same slice modify_random_values iterates over
df.columns[1:14]

Index(['Avg Max Temp', 'Avg Min Temp', 'Avg Temp', 'Avg Bare Soil Temp',
       'Avg Turf Soil Temp', 'Avg Wind Speed', 'Avg Max Wind Speed',
       'Total Solar Rad', 'Avg Penman PET', 'Total Penman PET',
       'Total Rainfall', 'Avg Dew Point', 'Avg Wind Chill'],
      dtype='object')

#Add Outliers

In [26]:
#Function that gets 10 random values from columns
#Multiplies five values by 10
#Divides five values by 10
def modify_random_values(df, seed=None, columns=None, n_values=10, factor=10):
    """Return a copy of ``df`` with synthetic outliers injected into some columns.

    For every selected column, ``n_values`` distinct rows are drawn at random.
    The first ``n_values // 2`` drawn rows are multiplied by ``factor`` and the
    remaining ones are divided by ``factor``. With the defaults this means
    five values x10 and five values /10 per column. ``df`` itself is never
    modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. Selected columns must be numeric.
    seed : int, optional
        Seed for reproducible row selection. A private ``RandomState`` is used,
        so the process-wide NumPy random state is left untouched while the
        selected rows stay identical to what ``np.random.seed(seed)`` followed
        by ``np.random.choice`` would pick. When ``None``, the global NumPy
        random state is used.
    columns : iterable of column labels, optional
        Columns to alter. Defaults to ``df.columns[1:14]`` (skip the first
        column, take up to the next 13).
    n_values : int, default 10
        Number of rows altered in each column.
    factor : number, default 10
        Scale applied to the selected values (multiply / divide).

    Returns
    -------
    pandas.DataFrame
        Modified copy of ``df``. Altered columns are returned as float64.

    Raises
    ------
    ValueError
        If ``n_values`` is negative or larger than the number of rows, or if
        ``factor`` is zero.
    """
    if columns is None:
        # Default keeps the original hard-coded selection.
        columns = df.columns[1:14]
    if n_values < 0 or n_values > len(df):
        raise ValueError(
            f"n_values must be between 0 and the number of rows ({len(df)}), "
            f"got {n_values}"
        )
    if factor == 0:
        raise ValueError("factor must be non-zero")

    # A local generator avoids reseeding NumPy's global state as a side effect.
    rng = np.random if seed is None else np.random.RandomState(seed)
    n_scaled_up = n_values // 2

    df_modified = df.copy()

    for col in columns:
        # Draw row *positions* rather than labels, so duplicate index labels
        # cannot cause more than n_values rows to be altered.
        positions = rng.choice(len(df_modified), size=n_values, replace=False)

        # Work on a float copy: dividing an integer column in place would rely
        # on implicit dtype upcasting.
        values = df_modified[col].astype(float).to_numpy(copy=True)
        values[positions[:n_scaled_up]] *= factor
        values[positions[n_scaled_up:]] /= factor
        df_modified[col] = values

    return df_modified

# Modify the DataFrame
# The function works on a copy, so `df` keeps the original values for the
# comparison that follows; the fixed seed makes the selected cells repeatable.
df_modified = modify_random_values(df, seed=42)

In [27]:
# Columns that were passed through the outlier step (same slice as inside
# modify_random_values); this repeats the earlier df.columns[1:14] display
df.columns[1:14]

Index(['Avg Max Temp', 'Avg Min Temp', 'Avg Temp', 'Avg Bare Soil Temp',
       'Avg Turf Soil Temp', 'Avg Wind Speed', 'Avg Max Wind Speed',
       'Total Solar Rad', 'Avg Penman PET', 'Total Penman PET',
       'Total Rainfall', 'Avg Dew Point', 'Avg Wind Chill'],
      dtype='object')

In [28]:
#Columns to compare: every column after the first, i.e. the same slice
#that modify_random_values alters
cols_to_modify = df.columns[1:14]

#Create a boolean mask that is True wherever the two frames differ
diff_mask = df[cols_to_modify] != df_modified[cols_to_modify]

#Keep only the differing cells; every unchanged cell becomes NaN
diff_original = df[cols_to_modify].where(diff_mask)
diff_modified = df_modified[cols_to_modify].where(diff_mask)

#Reshape to long format indexed by (row label, column name) and place the
#original and modified values side by side.
#The explicit dropna() removes the NaN placeholders of unchanged cells.
#The legacy stack() dropped them implicitly, but the newer stack
#implementation keeps them, which would produce one row per cell instead of
#one row per changed value.
diff_combined = pd.concat(
    [diff_original.stack().dropna(), diff_modified.stack().dropna()],
    axis=1,
    keys=['Original', 'Modified']
)

print("Differences between original and modified DataFrames:\n")
print(diff_combined)

Differences between original and modified DataFrames:

                        Original    Modified
7   Avg Wind Speed        9.6660     0.96660
9   Avg Max Temp         60.3065   603.06500
    Total Rainfall        2.6950     0.26950
12  Avg Turf Soil Temp   47.9070     4.79070
    Avg Dew Point        37.0060   370.06000
...                          ...         ...
267 Total Solar Rad     475.4230  4754.23000
268 Total Solar Rad     471.2000    47.12000
272 Total Penman PET      6.3000     0.63000
277 Avg Turf Soil Temp   50.0430   500.43000
283 Avg Max Temp         62.0395     6.20395

[130 rows x 2 columns]


#Comparing Data

In [29]:
# where() keeps the full grid of compared cells (unchanged ones are NaN),
# so the shape matches the compared slice of df
diff_original.shape

(286, 13)

In [30]:
# Built with the same mask as diff_original, so the shape is expected to match it
diff_modified.shape

(286, 13)

In [31]:
# One row per changed cell, two columns ('Original', 'Modified')
diff_combined.shape

(130, 2)

In [32]:
# First 25 changed cells with the original and modified value side by side
diff_combined.head(25)

Unnamed: 0,Unnamed: 1,Original,Modified
7,Avg Wind Speed,9.666,0.9666
9,Avg Max Temp,60.3065,603.065
9,Total Rainfall,2.695,0.2695
12,Avg Turf Soil Temp,47.907,4.7907
12,Avg Dew Point,37.006,370.06
14,Avg Turf Soil Temp,45.471,454.71
15,Avg Turf Soil Temp,44.252,442.52
20,Avg Wind Speed,8.3395,0.83395
21,Avg Penman PET,0.222,2.22
22,Avg Turf Soil Temp,52.203,522.03


In [33]:
# First 25 rows of the original values; NaN marks cells that were not changed
diff_original.iloc[0:25]

Unnamed: 0,Avg Max Temp,Avg Min Temp,Avg Temp,Avg Bare Soil Temp,Avg Turf Soil Temp,Avg Wind Speed,Avg Max Wind Speed,Total Solar Rad,Avg Penman PET,Total Penman PET,Total Rainfall,Avg Dew Point,Avg Wind Chill
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,
5,,,,,,,,,,,,,
6,,,,,,,,,,,,,
7,,,,,,9.666,,,,,,,
8,,,,,,,,,,,,,
9,60.3065,,,,,,,,,,2.695,,


In [34]:
# Same 25 rows after modification; compare cell by cell with the original view
diff_modified.iloc[0:25]

Unnamed: 0,Avg Max Temp,Avg Min Temp,Avg Temp,Avg Bare Soil Temp,Avg Turf Soil Temp,Avg Wind Speed,Avg Max Wind Speed,Total Solar Rad,Avg Penman PET,Total Penman PET,Total Rainfall,Avg Dew Point,Avg Wind Chill
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,
5,,,,,,,,,,,,,
6,,,,,,,,,,,,,
7,,,,,,0.9666,,,,,,,
8,,,,,,,,,,,,,
9,603.065,,,,,,,,,,0.2695,,


In [35]:
# Summary statistics after outlier injection; the injected values show up in
# the min/max rows, far outside the 25%-75% range of each altered column
df_modified.describe()

Unnamed: 0,Value,Avg Max Temp,Avg Min Temp,Avg Temp,Avg Bare Soil Temp,Avg Turf Soil Temp,Avg Wind Speed,Avg Max Wind Speed,Total Solar Rad,Avg Penman PET,Total Penman PET,Total Rainfall,Avg Dew Point,Avg Wind Chill
count,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0
mean,94.711538,67.560204,40.619966,54.220327,56.318528,53.298813,11.378281,28.812858,511.041068,0.223275,6.958245,2.177937,39.246424,50.490129
std,35.748752,71.160593,39.867773,56.735327,61.450905,58.333088,12.847519,31.674484,528.266306,0.213927,7.53602,2.80387,41.124466,56.417984
min,25.4,5.19535,3.53345,4.53725,4.60665,4.2655,0.83395,2.45205,41.3334,0.0143,0.42845,0.0795,3.3092,4.1985
25%,66.3,56.5165,33.6525,45.03225,46.244,43.16975,8.975375,23.551125,427.6245,0.1805,5.51525,0.9625,32.19775,40.595
50%,92.45,59.756,35.9945,47.66,49.103,46.5335,9.8975,25.1385,446.817,0.2025,6.16825,1.63525,34.638,44.26425
75%,120.625,61.787,38.08275,49.7595,51.857375,49.47875,10.67,26.611125,475.534,0.218875,6.689,2.548,36.446,46.641875
max,198.0,623.305,351.24,498.645,528.51,522.03,132.53,289.54,4754.23,2.22,73.175,30.01,370.06,507.08
