In [16]:
#1. Data Exploration
import pandas as pd

# Load your datasets
train_df = pd.read_csv('Titanic_train.csv')
test_df = pd.read_csv('titanic_test.csv')

# View the first few rows
print(train_df.head())
print(test_df.head())



   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
  

In [17]:
#2. Explore the Data

#Check for missing values, data types, and distributions:
print(train_df.info())
print(train_df.describe())
print(train_df.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.48659

In [18]:
#3: Preprocess the Data

#We'll clean both training and test data the same way:
# Fill missing Age with median
train_df['Age'].fillna(train_df['Age'].median(), inplace=True)
test_df['Age'].fillna(test_df['Age'].median(), inplace=True)

# Fill missing Fare in test set
test_df['Fare'].fillna(test_df['Fare'].median(), inplace=True)

# Drop 'Cabin' due to too many missing values
train_df.drop(columns=['Cabin'], inplace=True)
test_df.drop(columns=['Cabin'], inplace=True)

# Fill Embarked with mode
train_df['Embarked'].fillna(train_df['Embarked'].mode()[0], inplace=True)

# Convert Sex and Embarked to numeric using one-hot encoding
train_df = pd.get_dummies(train_df, columns=['Sex', 'Embarked'], drop_first=True)
test_df = pd.get_dummies(test_df, columns=['Sex', 'Embarked'], drop_first=True)

# Align columns
train_df, test_df = train_df.align(test_df, join='left', axis=1, fill_value=0)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Age'].fillna(train_df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Age'].fillna(test_df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object 

In [19]:
#4. Feature Selection
#Choose input features and the target variable:
# Drop irrelevant columns
X = train_df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Survived'])
y = train_df['Survived']

# For test data
X_test_final = test_df.drop(columns=['PassengerId', 'Name', 'Ticket'])


In [20]:
#5. Train Logistic Regression Model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Split training data for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))


Accuracy: 0.8100558659217877
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       105
           1       0.79      0.74      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179



In [21]:
if 'Survived' in test_df.columns:
    test_df = test_df.drop(columns=['Survived'])

In [22]:
X_test_final = test_df.drop(columns=['PassengerId', 'Name', 'Ticket'])
X_test_final = test_df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Survived'], errors='ignore')

In [23]:
#6. Predict on Test Set
# Predict on your test dataset
predictions = model.predict(X_test_final)

# Create a submission file (if needed for Kaggle)
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],

})

submission.to_csv('submission.csv', index=False)

In [24]:
df = pd.read_csv('submission.csv')
df

Unnamed: 0,PassengerId
0,892
1,893
2,894
3,895
4,896
...,...
413,1305
414,1306
415,1307
416,1308


In [25]:
import pickle

# Save the trained model to a file
with open('logistic_model.pkl', 'wb') as file:
    pickle.dump(model, file)

print("Model saved as 'logistic_model.pkl'")

Model saved as 'logistic_model.pkl'


In [26]:
app_code = '''
import streamlit as st
import pickle
import numpy as np

# Load the trained model
with open("logistic_model.pkl", "rb") as f:
    model = pickle.load(f)

st.title("Logistic Regression Predictor App")

st.write("Enter the input features to make a prediction:")

# Example input features (update as per your dataset)
# You must match this with your original model features
age = st.number_input("Age of the car", min_value=0, value=5)
km = st.number_input("Kilometers Driven", min_value=0, value=50000)
fuel_diesel = st.selectbox("Fuel Type - Diesel?", ["No", "Yes"])
fuel_petrol = st.selectbox("Fuel Type - Petrol?", ["No", "Yes"])
hp = st.number_input("Horse Power", min_value=0, value=90)
automatic = st.selectbox("Automatic Transmission?", ["No", "Yes"])
doors = st.number_input("Number of Doors", min_value=2, max_value=5, value=4)
weight = st.number_input("Car Weight (kg)", min_value=500, value=1000)

# Convert categorical inputs to binary
fuel_diesel = 1 if fuel_diesel == "Yes" else 0
fuel_petrol = 1 if fuel_petrol == "Yes" else 0
automatic = 1 if automatic == "Yes" else 0

# Create input array
input_data = np.array([[age, km, fuel_diesel, fuel_petrol, hp, automatic, doors, weight]])

# Prediction
if st.button("Predict"):
    result = model.predict(input_data)
    st.success(f"Prediction: {'Will Purchase' if result[0] == 1 else 'Will Not Purchase'}")
'''

# Save to file
with open("streamlit_app.py", "w") as f:
    f.write(app_code)

print("Streamlit app file 'streamlit_app.py' created.")


Streamlit app file 'streamlit_app.py' created.


# Following command needs to be run on the terminal outside the notebook editor:
#  streamlit run streamlit_app.py