In [1]:
# Import the required modules
from pathlib import Path

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

## Step 1: Read in the dataset about the current customers of the startup.

In [3]:
# Load the customer usage statistics CSV from the Resources folder
usage_stats_path = Path('../Resources/usage_stats.csv')
customer_df = pd.read_csv(usage_stats_path)

# Preview the first few rows to sanity-check the load
customer_df.head()

Unnamed: 0,Usage Stats,Referral History,Customer Rank,target
0,1.054075,-2.010163,-0.918689,0
1,2.033251,-0.212776,-2.947451,0
2,1.049233,-2.239878,-0.77708,0
3,0.837035,-1.926558,-1.113686,0
4,1.19377,-1.550953,-1.539586,0


## Step 2: Split the data into X and y and then into testing and training sets.

In [4]:
# Separate the label from the features:
#   y -> the "target" column (the value the model learns to predict)
#   X -> every remaining column
y = customer_df.loc[:, "target"]
X = customer_df.drop(columns=["target"])

In [5]:
# Check the shape of y: it should be one-dimensional, holding only the target column
y.shape

(1210,)

In [7]:
# Check the shape of X: it should keep every column except the target
X.shape

(1210, 3)

In [8]:
# Partition the features and labels into training and testing subsets.
# stratify=y keeps the class proportions the same in both subsets, and
# random_state=1 makes the shuffle reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, stratify=y
)


## Step 3: Fit a logistic regression classifier.

In [19]:
# Build the classifier with the lbfgs solver and a random_state of 9
logistic_regression_model = LogisticRegression(solver='lbfgs', random_state=9)

# Train the classifier on the training split
logistic_regression_model.fit(X_train, y_train)

# fit() returns the estimator itself, so lr_model is simply a second name
# for the same fitted model
lr_model = logistic_regression_model

## Step 4: Create the predicted values for the testing and the training data.

In [20]:
# Generate predictions for the data the model was trained on
training_predictions = lr_model.predict(X_train)

# Generate predictions for the held-out testing data, using the same fitted
# model handle (lr_model) as the training predictions for consistency
testing_predictions = lr_model.predict(X_test)


## Step 5: Print a confusion matrix for the training data.

In [21]:
# Create and save the confusion matrix for the training data.
# confusion_matrix is imported with the other modules at the top of the notebook.
# Rows are the actual classes; columns are the predicted classes.
training_matrix = pd.DataFrame(confusion_matrix(y_train, training_predictions))

# Print the confusion matrix for the training data
print(training_matrix)

     0   1
0  810   6
1   16  75


## Step 6: Print a confusion matrix for the testing data.

In [22]:
# Tabulate actual vs. predicted classes for the held-out testing data
test_confusion = confusion_matrix(y_test, testing_predictions)
test_matrix = pd.DataFrame(test_confusion)

# Show the testing confusion matrix
print(test_matrix)

     0   1
0  268   5
1    3  27


## Step 7: Print the training classification report.

In [23]:
# Summarize per-class precision, recall, and F1 on the training data
training_report = classification_report(
    y_true=y_train,
    y_pred=training_predictions,
)

# Show the training classification report
print(training_report)

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       816
           1       0.93      0.82      0.87        91

    accuracy                           0.98       907
   macro avg       0.95      0.91      0.93       907
weighted avg       0.98      0.98      0.98       907



## Step 8: Print the testing classification report.

In [24]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

           0       0.99      0.98      0.99       273
           1       0.84      0.90      0.87        30

    accuracy                           0.97       303
   macro avg       0.92      0.94      0.93       303
weighted avg       0.97      0.97      0.97       303



## Step 9: Answer the following question

**Question:** How does the model's performance on the training and test datasets compare?

**Sample Answer:** Comparing the two classification reports for the training and test data, overall model performance declined--albeit slightly--on the test data: accuracy slipped from 0.98 to 0.97, and precision for class 1 fell from 0.93 to 0.84. Recall for class 1 actually rose from 0.82 to 0.90, leaving its F1-score unchanged at 0.87. A small decline is to be expected, because the test metrics show how well the model performs on data that it hasn't seen before. Since we're still getting strong precision and recall on the test dataset, this is a good indication of how well the model is likely to perform in real life.

A model usually performs slightly better on its training data than on testing data it has not seen. That's expected.