### We're going to predict the broad keyword ("broadKeyword") of a course from its title ("Titles")

In [1]:
# Import required modules
import pandas as pd
import re
from nltk.corpus import stopwords
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import pickle

# Download stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the course CSV, discard the "Unnamed: 3" column, and keep only rows
# that have no missing values
df = (
    pd.read_csv("courseWithKeyword.csv")
    .drop(columns="Unnamed: 3")
    .dropna()
)
df.head()

Unnamed: 0,Titles,Exact Keyword,broadKeyword
0,Proofreading & copy editing course (Level 4 Di...,proofreading,copy editing
1,Public relations course (Level 4 Diploma),public relations,public relation
2,Learn Arabic,arabic,language
3,Learn French includes MP3 Downloads,french,language
4,Learn German,german,language


In [3]:
# Cleaning and processing

# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently turn every step below into
# a no-op. Raw strings keep "\s" / "\d" from being read as (invalid) Python
# string escapes.
#
# r"[^a-zA-Z0-9]" replaces every character that is not an ASCII letter or digit
# r"(\s\d*)"      replaces a whitespace character together with any digits that
#                 directly follow it (NOTE: this also strips digits that start a
#                 word, e.g. " 3ds" becomes " ds")
# r"(\s{2,})"     collapses runs of 2 or more whitespace characters into one space
cleaned_titles = (
    df.Titles
    .str.replace(r"[^a-zA-Z0-9]", " ", regex=True)
    .str.replace(r"(\s\d*)", " ", regex=True)
    .str.replace(r"(\s{2,})", " ", regex=True)
)

# Lowercase titles and trim leading/trailing whitespace
df.Titles = cleaned_titles.str.lower().str.strip()

In [4]:
# Stopwords
# A set is hash-based, so membership tests are already O(1) on average;
# no conversion to a dict-like container is needed.
STOPWORDS = set(stopwords.words("english"))

# Remove stopwords: keep only the tokens that are not in STOPWORDS and
# re-join them with single spaces
df.Titles = df.Titles.apply(
    lambda title: " ".join(token for token in title.split() if token not in STOPWORDS)
)

In [5]:
# Extract feature vector and response vector
X = df.Titles
y = df["broadKeyword"]


# Split train and test data (80/20); the fixed random_state makes the shuffle
# reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.2, random_state=43)

In [6]:
# Make model pipeline
def make_pipeline(clf, model_path="LogisticRegressionModel.pickle"):
    """Fit a CountVectorizer -> TF-IDF -> ``clf`` pipeline, save it, and evaluate it.

    The pipeline is trained on the notebook-level ``X_train`` / ``y_train``,
    pickled to ``model_path``, and then scored on the notebook-level
    ``X_test`` / ``y_test``. Those four variables must exist before calling.

    Parameters
    ----------
    clf : estimator
        Classifier placed as the final step of the pipeline.
    model_path : str, optional
        File the fitted pipeline is pickled to. Defaults to
        "LogisticRegressionModel.pickle", the name used by the rest of the
        notebook.

    Returns
    -------
    metric_df : pandas.DataFrame
        Classification report with columns "category", "precision",
        "recall" and "f1-score" (the "support" column is dropped).
    pred_df : pandas.DataFrame
        True test labels ("y_test") next to the predicted labels ("pred").
    """
    model = Pipeline([
        ("countvec", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", clf)])

    model.fit(X_train, y_train)
    with open(model_path, "wb") as f:
        pickle.dump(model, f)

    # Make prediction on test set
    y_pred = model.predict(X_test)

    pred_df = pd.DataFrame({
        "y_test": y_test,
        "pred": y_pred
    })

    # zero_division=0 yields the same scores as the default ("warn") but
    # without a warning for every label that has no predicted or true samples.
    report = classification_report(
        pred_df.y_test, pred_df.pred, output_dict=True, zero_division=0
    )
    metric_df = pd.DataFrame(report).T.drop("support", axis=1).reset_index()
    metric_df.columns = ["category", "precision", "recall", "f1-score"]
    return metric_df, pred_df

In [7]:
# Use logistic regression model: build a LogisticRegression with default
# settings, hand it to make_pipeline, and keep the returned metrics table
# and test-set predictions
model = LogisticRegression()
metric_df, pred_df = make_pipeline(model)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
# See the classification report: one row per category, followed by summary
# rows such as "accuracy" and "macro avg"
metric_df

Unnamed: 0,category,precision,recall,f1-score
0,3ds max,1.000000,1.000000,1.000000
1,TEFL,0.000000,0.000000,0.000000
2,ableton live,1.000000,1.000000,1.000000
3,abrasive wheels,1.000000,1.000000,1.000000
4,account management,1.000000,1.000000,1.000000
...,...,...,...,...
576,youth worker,0.000000,0.000000,0.000000
577,zoology,0.000000,0.000000,0.000000
578,accuracy,0.759113,0.759113,0.759113
579,macro avg,0.542530,0.510465,0.509252


In [9]:
# Count the test records whose predicted label matches the true label
n_correct = (pred_df.y_test == pred_df.pred).sum()
n_records = len(pred_df)
print(f"{n_correct} correct predictions out of {n_records} records")

2978 correct predictions out of 3923 records


In [10]:
# Read the saved model again and make prediction.
# The pipeline was written with pickle.dump, so it is read back with
# pickle.load. NOTE: unpickling can execute arbitrary code, so only load
# pickle files that come from a trusted source.
with open("LogisticRegressionModel.pickle", "rb") as f:
    model = pickle.load(f)
model.predict(["dog walking"])[0]

'pet training'