In [None]:
import pandas as pd

# Load the raw yearly ship records.
df_2015 = pd.read_csv("ships_2015.csv")
df_2016 = pd.read_csv("ships_2016.csv")

# Impute missing LDT using 0.8 × GT, rounded to a whole number.
# A masked, vectorised assignment replaces the row-wise apply: rows that
# already carry an LDT are left untouched, only the gaps are computed, and an
# empty frame no longer breaks the column assignment.
for df in [df_2015, df_2016]:
    missing_ldt = df['LDT'].isna()
    gt_for_missing = pd.to_numeric(df.loc[missing_ldt, 'GT'])
    df.loc[missing_ldt, 'LDT'] = (0.8 * gt_for_missing).round(0)


In [None]:
import pandas as pd

# Inspect the column names of the 2015 frame that is already in memory.
# The CSV is deliberately not re-read here: rebinding df_2015 to a fresh
# read_csv result would throw away the LDT imputation applied to it earlier.
print(df_2015.columns.tolist())


['YEAR', 'IMO', 'NAME', 'TYPE', 'GT', 'LDT', 'BUILT', 'LAST_FLAG', 'PREVIOUS_FLAG', 'BENEFICIAL_OWNER', 'BO_COUNTRY', 'COMMERCIAL_OPERATOR', 'REGISTERED_OWNER', 'RO_COUNTRY', 'PLACE', 'COUNTRY', 'ARRIVAL']


In [None]:
# Normalise the headers of both yearly frames: trim surrounding whitespace
# and upper-case every column name.
for yearly_frame in (df_2015, df_2016):
    yearly_frame.columns = yearly_frame.columns.str.strip().str.upper()


In [None]:
from datetime import datetime

def format_arrival(date_str):
    """Reformat a ``DD-MM-YYYY`` arrival date as ``DD-Mon`` (e.g. ``18-May``).

    Parameters
    ----------
    date_str : str
        Arrival date text, expected to follow the ``%d-%m-%Y`` pattern.

    Returns
    -------
    str
        Day and abbreviated month name when ``date_str`` parses with the
        expected pattern; otherwise ``date_str`` itself, unchanged.
    """
    try:
        parsed = pd.to_datetime(date_str, format='%d-%m-%Y', errors='coerce')
        if pd.notnull(parsed):
            return parsed.strftime('%d-%b')  # e.g., 18-May
        return date_str  # keep original if can't parse
    except Exception:
        # Best-effort formatting: any conversion error keeps the original
        # value. `Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt and SystemExit propagate.
        return date_str

# Reformat ARRIVAL on both frames. Missing cells are kept as NaN: casting the
# whole column with astype(str) alone would turn them into the literal text
# "nan", so the formatted result is masked back with the original null map.
for df in [df_2015, df_2016]:
    arrival = df['ARRIVAL']
    formatted = arrival.astype(str).apply(format_arrival)
    df['ARRIVAL'] = formatted.where(arrival.notna())




In [None]:
import pandas as pd

# Load the 2015 dataset.
# NOTE(review): this rebinds df_2015 to a fresh read of the raw CSV, so any
# changes made to the in-memory df_2015 by earlier cells are not carried into
# the file saved below - confirm that is intended.
df_2015 = pd.read_csv("/content/ships_2015.csv")

# Convert column names to uppercase for consistency
df_2015.columns = df_2015.columns.str.strip().str.upper()

# Impute missing LDT values with the column median when the column exists.
# NOTE(review): other cells in this notebook impute LDT as 0.8 × GT; confirm
# that the median is the intended rule for this file.
ldt_present = "LDT" in df_2015.columns
if ldt_present:
    df_2015["LDT"] = df_2015["LDT"].fillna(df_2015["LDT"].median())
else:
    print("⚠️ No 'LDT' column found in dataset!")

# Save cleaned 2015 dataset
df_2015.to_csv("/content/ships_2015_cleaned.csv", index=False)

# Only claim that imputation happened when it actually did; previously the
# success message was printed even after the "no LDT column" warning.
if ldt_present:
    print("✅ LDT values imputed and file saved: /content/ships_2015_cleaned.csv")
else:
    print("File saved without LDT imputation: /content/ships_2015_cleaned.csv")


✅ LDT values imputed and file saved: /content/ships_2015_cleaned.csv


In [None]:
# Write each cleaned yearly frame to its own CSV, without the index column.
for cleaned_frame, csv_name in (
    (df_2015, "ships_2015_cleaned.csv"),
    (df_2016, "ships_2016_cleaned.csv"),
):
    cleaned_frame.to_csv(csv_name, index=False)


In [None]:
import pandas as pd
from google.colab import files

# --- Step 1: Load your CSV files ---
df_2015 = pd.read_csv("ships_2015_cleaned.csv")
df_2016 = pd.read_csv("ships_2016_cleaned.csv")
df_2022_2024 = pd.read_csv("ships_2022_2024.csv")

# --- Step 2: Clean column names ---
for df in [df_2015, df_2016, df_2022_2024]:
    df.columns = df.columns.str.strip().str.upper()

# --- Step 3: Align columns ---
# Keep only the columns present in all three files, in df_2015's own order.
# Building the list from a set intersection made the column order of the
# merged CSV vary from run to run.
common_cols = list(
    df_2015.columns
    .intersection(df_2016.columns, sort=False)
    .intersection(df_2022_2024.columns, sort=False)
)
df_2015 = df_2015[common_cols]
df_2016 = df_2016[common_cols]
df_2022_2024 = df_2022_2024[common_cols]

# --- Step 4: Merge and sort ---
merged_df = pd.concat([df_2015, df_2016, df_2022_2024], ignore_index=True)
# A stable sort keeps rows of the same YEAR in their original file order, so
# the output is reproducible.
merged_df = merged_df.sort_values(by="YEAR", kind="stable").reset_index(drop=True)

# --- Step 5: Save merged CSV ---
output_path = "ships_2015_2024_complete.csv"
merged_df.to_csv(output_path, index=False)
print("✅ Merged file created:", output_path)

# --- Step 6: Download to your computer ---
files.download(output_path)


✅ Merged file created: ships_2015_2024_complete.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
from google.colab import files

# Read the merged dataset back from disk (swap in your actual merged file name).
merged = pd.read_csv("/content/ships_2015_2024_complete.csv")

# Target left-to-right column layout: vessel identity and size first, then
# flag and ownership details, then where and when the ship arrived.
cols = (
    ["YEAR", "IMO", "NAME", "TYPE", "GT", "LDT", "BUILT"]
    + ["LAST_FLAG", "PREVIOUS_FLAG", "BENEFICIAL_OWNER", "BO_COUNTRY"]
    + ["COMMERCIAL_OPERATOR", "REGISTERED_OWNER", "RO_COUNTRY"]
    + ["PLACE", "COUNTRY", "ARRIVAL"]
)

# Select the columns in that order; a column missing from the file raises KeyError.
# NOTE(review): the reordered frame is not written to disk in this cell.
merged = merged.loc[:, cols]



In [None]:
import pandas as pd
from google.colab import files

# Load dataset
df = pd.read_csv("/content/ships_complete.csv")

# Standardize column names
df.columns = df.columns.str.strip().str.upper()

# Convert GT and LDT to numeric safely.
# Thousands separators and stray whitespace are removed first; otherwise
# errors="coerce" silently turns a value such as "12,345" into NaN, which
# both loses the real figure and leaves the LDT un-imputable.
for col in ("GT", "LDT"):
    as_text = df[col].astype(str).str.replace(",", "", regex=False).str.strip()
    df[col] = pd.to_numeric(as_text, errors="coerce")

# Impute missing LDT = 0.8 * GT (rows whose GT is also missing stay NaN)
ldt_missing = df["LDT"].isna()
missing_before = ldt_missing.sum()
df.loc[ldt_missing, "LDT"] = 0.8 * df.loc[ldt_missing, "GT"]
missing_after = df["LDT"].isna().sum()

print(f"✅ Imputed {missing_before - missing_after} missing LDT values using 0.8 × GT.")

# Save and download
output_path = "/content/ships_complete_imputed.csv"
df.to_csv(output_path, index=False)
files.download(output_path)


✅ Imputed 823 missing LDT values using 0.8 × GT.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd

# Load data
df = pd.read_csv("ships_complete.csv")

# Clean GT and LDT the same way: drop thousands separators and surrounding
# whitespace, then convert to numbers.
# - Blank or unparseable cells become NaN instead of 0, so a missing GT no
#   longer produces a bogus imputed LDT of 0.
# - LDT values written with commas are parsed rather than coerced to NaN and
#   then overwritten by an estimate.
for col in ('GT', 'LDT'):
    cleaned = df[col].astype(str).str.replace(',', '', regex=False).str.strip()
    df[col] = pd.to_numeric(cleaned, errors='coerce')

# Check missing before
print("Missing LDTs before:", df['LDT'].isna().sum())

# Impute only missing LDTs with 0.8 * GT (rows without a GT stay missing)
mask = df['LDT'].isna()
df.loc[mask, 'LDT'] = 0.8 * df.loc[mask, 'GT']

# Check after
print("Missing LDTs after:", df['LDT'].isna().sum())

# Save cleaned dataset
df.to_csv("ships_all_complete_imputed.csv", index=False)
print("✅ Imputation complete and saved as 'ships_all_complete_imputed.csv'")


Missing LDTs before: 1891
Missing LDTs after: 8
✅ Imputation complete and saved as 'ships_complete_imputed.csv'


PHASE 3 Start: train a Gaussian Naive Bayes classifier to predict REGION (South Asia vs. Other)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# 1. Load dataset
df = pd.read_csv("/content/ships_complete_imputed.csv")
df.columns = df.columns.str.strip().str.upper()

# 2. Create REGION target variable: 'South Asia' for the three listed
#    countries, 'Other' for everything else.
south_asia = ['India', 'Bangladesh', 'Pakistan']
df['REGION'] = df['COUNTRY'].apply(lambda x: 'South Asia' if x in south_asia else 'Other')

# 3. Select features
features = ["LDT", "TYPE", "BUILT", "LAST_FLAG", "GT"]
numeric_cols = ["LDT", "GT", "BUILT"]
X = df[features].copy()  # use copy to avoid SettingWithCopyWarning

# 4. Clean numeric columns: remove commas/whitespace and convert to float.
#    Unparseable or missing cells become NaN here and are removed below.
for col in numeric_cols:
    as_text = X[col].astype(str).str.replace(",", "", regex=False).str.strip()
    X[col] = pd.to_numeric(as_text, errors="coerce")

# GaussianNB rejects NaN input (fit raises "Input X contains NaN"), so keep
# only rows where every numeric feature is present, and take the target from
# the same rows so X and y stay aligned.
complete_rows = X[numeric_cols].notna().all(axis=1)
n_dropped = int((~complete_rows).sum())
if n_dropped:
    print(f"Dropping {n_dropped} rows with missing LDT/GT/BUILT before training.")
X = X.loc[complete_rows].copy()
y = df.loc[complete_rows, "REGION"]

# 5. Encode categorical variables as integer codes (missing values are
#    encoded as their own "nan" category by the astype(str) cast).
label_cols = ["TYPE", "LAST_FLAG"]
encoder = LabelEncoder()
for col in label_cols:
    X[col] = encoder.fit_transform(X[col].astype(str))

# 6. Split data (70/30, stratified on the target)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# 7. Train Naive Bayes model
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# 8. Evaluate
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Accuracy: {acc:.3f}")
print(f"F1-Score: {f1:.3f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# 9. Confusion Matrix
cm = confusion_matrix(y_test, y_pred, labels=['South Asia', 'Other'])
plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=['South Asia', 'Other'], yticklabels=['South Asia', 'Other'])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

# 10. Plot class distribution (all loaded rows, including any dropped above)
plt.figure(figsize=(6,4))
sns.countplot(x='REGION', data=df)
plt.title("Class Distribution in Dataset")
plt.show()


ValueError: Input X contains NaN.
GaussianNB does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values