In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from fuzzywuzzy import fuzz

# Load dataset (replace with actual file)
# NOTE(review): the path is absolute and machine-specific; adjust it before running elsewhere.
df = pd.read_csv("/home/developer/projects/my-web-app/data/market_data.csv")

# 1. Detect Missing Values
# Per-column count of null cells; this only reports them, nothing is filled or dropped.
missing_values = df.isnull().sum()
print("Missing Values:\n", missing_values)

# 2. Identify Duplicates
# duplicated() with its defaults flags rows that exactly repeat an earlier row
# across all columns, so the first occurrence of each row is not listed here.
duplicates = df[df.duplicated()]
print("Duplicate Records:\n", duplicates)

# 3. Anomaly Detection using Isolation Forest
# contamination=0.05 tells the model to expect roughly 5% outliers;
# random_state is fixed so repeated runs give the same result.
iso_forest = IsolationForest(contamination=0.05, random_state=42)
# fit_predict returns a label per row (-1 = outlier, 1 = inlier), so despite its
# name the 'anomaly_score' column holds labels, not continuous scores.
# NOTE(review): the numeric columns are passed as-is, with no imputation of
# missing values -- confirm the installed scikit-learn version accepts NaN here.
df['anomaly_score'] = iso_forest.fit_predict(df.select_dtypes(include=[np.number]))
# Keep only the rows the model labelled as outliers.
anomalies = df[df['anomaly_score'] == -1]
print("Anomalies Detected:\n", anomalies)

# 4. Fuzzy Matching for Possible Duplicates (Example on 'Name' column)
def fuzzy_duplicate_check(col, threshold=80, scorer=None):
    """Find pairs of values in a column whose text is nearly identical.

    Every non-missing value is compared with every later non-missing value,
    so the number of comparisons grows quadratically with the column length.

    Args:
        col: pandas Series of values to compare. Each value is converted
            with ``str`` before it is scored.
        threshold: Minimum similarity for a pair to be reported; pairs that
            score at or above this value are kept.
        scorer: Callable taking two strings and returning a similarity
            score comparable with ``threshold``. Defaults to ``fuzz.ratio``.

    Returns:
        A list of ``(first_value, second_value, similarity)`` tuples, ordered
        by the position of the pair in the column. Empty when the column has
        fewer than two non-missing values.
    """
    from itertools import combinations

    if scorer is None:
        scorer = fuzz.ratio

    # Drop missing entries first: otherwise they stringify to "nan"/"None"
    # and every pair of blank cells is reported as a perfect match.
    values = col.dropna().tolist()
    # Convert to text once instead of on every comparison.
    texts = [str(value) for value in values]

    potential_duplicates = []
    # combinations() yields (i, j) with i < j in the same order as the
    # original nested index loops.
    for i, j in combinations(range(len(values)), 2):
        similarity = scorer(texts[i], texts[j])
        if similarity >= threshold:
            potential_duplicates.append((values[i], values[j], similarity))
    return potential_duplicates

# The fuzzy check only runs when the dataset actually has a 'Name' column;
# otherwise this step is skipped silently.
if 'Name' in df.columns:
    fuzzy_duplicates = fuzzy_duplicate_check(df['Name'])
    print("Potential Duplicate Names:", fuzzy_duplicates)

# Save cleaned dataset
# NOTE(review): no step in this cell drops duplicates, fills missing values, or
# removes flagged rows, so the file written here is the loaded data plus the
# added 'anomaly_score' column. index=False omits the row index from the CSV.
df.to_csv("/home/developer/projects/my-web-app/data/cleaned_data.csv", index=False)
print("Data cleaning complete. Processed file saved as 'cleaned_data.csv'")


Missing Values:
 Date         1
Open         4
High         4
Low          4
Close        4
Adj Close    1
Volume       4
Ticker       1
Open.1       4
High.1       4
Low.1        4
Close.1      4
Volume.1     4
Open.2       4
High.2       4
Low.2        4
Close.2      4
Volume.2     4
Open.3       4
High.3       4
Low.3        4
Close.3      4
Volume.3     4
Open.4       4
High.4       4
Low.4        4
Close.4      4
Volume.4     4
dtype: int64
Duplicate Records:
 Empty DataFrame
Columns: [Date, Open, High, Low, Close, Adj Close, Volume, Ticker, Open.1, High.1, Low.1, Close.1, Volume.1, Open.2, High.2, Low.2, Close.2, Volume.2, Open.3, High.3, Low.3, Close.3, Volume.3, Open.4, High.4, Low.4, Close.4, Volume.4]
Index: []

[0 rows x 28 columns]
Anomalies Detected:
 Empty DataFrame
Columns: [Date, Open, High, Low, Close, Adj Close, Volume, Ticker, Open.1, High.1, Low.1, Close.1, Volume.1, Open.2, High.2, Low.2, Close.2, Volume.2, Open.3, High.3, Low.3, Close.3, Volume.3, Open.4, High.4, 