Example 1 (Cancer prediction based on tumor size)

In [1]:
import numpy
from sklearn import linear_model

#Reshaped for Logistic function.
X = numpy.array([3.78, 2.44, 2.09, 0.14, 1.72, 1.65, 4.92, 4.37, 4.96, 4.52, 3.69, 5.88]).reshape(-1,1)
y = numpy.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

logr = linear_model.LogisticRegression()
logr.fit(X,y)

#predict if tumor is cancerous where the size is 3.46mm:
predicted = logr.predict(numpy.array([3.78]).reshape(-1,1))
print(predicted)

[1]


Example 2: Iris Dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Step 1: Load the dataset
data = sns.load_dataset('iris')
data.sample(10)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
14,5.8,4.0,1.2,0.2,setosa
61,5.9,3.0,4.2,1.5,versicolor
31,5.4,3.4,1.5,0.4,setosa
89,5.5,2.5,4.0,1.3,versicolor
29,4.7,3.2,1.6,0.2,setosa
136,6.3,3.4,5.6,2.4,virginica
128,6.4,2.8,5.6,2.1,virginica
65,6.7,3.1,4.4,1.4,versicolor
19,5.1,3.8,1.5,0.3,setosa
115,6.4,3.2,5.3,2.3,virginica


In [None]:
data['variety'].values

In [None]:
# Step 2: Data Preprocessing - Encode the target variable 'variety'
label_encoder = LabelEncoder()
data['variety'] = label_encoder.fit_transform(data['variety'])
data


In [None]:
# Step 3: Define features (X) and target (y)
X = data.drop(columns=['variety'])  # Features: Sepal length, Sepal width, Petal length, Petal width
y = data['variety']                # Target: Encoded flower variety

In [None]:
# Step 4: Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=15)

In [None]:
# Step 5: Apply Logistic Regression
logistic_model = LogisticRegression(max_iter=200)
logistic_model.fit(X_train, y_train)

In [None]:
# Step 6: Predict on the test set
y_pred = logistic_model.predict(X_test)
len(y_pred)

In [None]:
# Step 7: Evaluate the model performance
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

In [None]:
# Step 8: Compute the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix

In [None]:
# Step 9: Visualize the confusion matrix
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Print results
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)

In [None]:
# Example input for a new Iris flower with feature values:
new_input = [[2.9, 3.0, 1.1, 4.8]]  # sepal.length, sepal.width, petal.length, petal.width

# Convert the input to a DataFrame with the same feature names
new_input_df = pd.DataFrame(new_input, columns=['sepal.length', 'sepal.width', 'petal.length', 'petal.width'])

# Use the trained logistic regression model to predict the class
predicted_class = logistic_model.predict(new_input_df)

# Convert the predicted numerical class back to the original flower name
predicted_flower = label_encoder.inverse_transform(predicted_class)

print(f"Predicted Flower: {predicted_flower[0]}")


Example 3: Diabetes dataset

In [2]:
# Load the dataset
data = pd.read_csv('./../datafiles/diabetes.csv')

In [3]:
# Display the first few rows of the dataset
print(data.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [None]:
# Features and target variable
X = data.drop('Outcome', axis=1)
y = data['Outcome']

In [None]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating and training the logistic regression model
model = LogisticRegression(max_iter=200,class_weight='balanced')
model.fit(X_train, y_train)

# Making predictions on the test set
y_pred = model.predict(X_test)

In [None]:
# After training and before making predictions
print("Class distribution in target variable:")
print(y.value_counts())

# Making predictions on the test set
y_pred = model.predict(X_test)

# Check unique predictions
print("Unique predictions:", np.unique(y_pred))

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# Classification report
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)


In [None]:
# Visualizing the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='flare', 
            xticklabels=['No Diabetes', 'Diabetes'], 
            yticklabels=['No Diabetes', 'Diabetes'])

# Add labels, title, and adjust for readability
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Testing with a new input
new_data = pd.DataFrame([[5, 116, 74, 0, 0, 25.6, 0.201, 30]], 
                         columns=X.columns)  # Use the same columns as X
prediction = model.predict(new_data)
print(f"Predicted Outcome for input {new_data.iloc[0].values}: {prediction[0]}")

In [3]:
from sklearn.linear_model import LinearRegression
import numpy as np

# Sample data (independent variable X and dependent variable Y)
X = np.array([[1], [2], [3], [4], [5]])  # Reshape for sklearn (expects 2D array)
Y = np.array([2, 4, 5, 4, 5])

# Create a linear regression model
model = LinearRegression()

# Fit the model with the data
model.fit(X, Y)

# Get the slope (beta_1) and intercept (beta_0)
beta_1 = model.coef_[0]
beta_0 = model.intercept_

# Output
print(f"Slope (beta_1): {beta_1}")
print(f"Intercept (beta_0): {beta_0}")


Slope (beta_1): 0.6
Intercept (beta_0): 2.2
