In [1]:
import os
import pandas as pd
import numpy as np
import json
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression



In [2]:
# Read the JSON data from the file
with open('../data/training.json', 'r') as file:
    data_list = [json.loads(line) for line in file]
df = pd.json_normalize(data_list)
df.head()

Unnamed: 0,city,category,section,heading
0,,,,
1,newyork,cell-phones,for-sale,New batteries C-S2 for Blackberry 7100/7130/87...
2,newyork,cell-phones,for-sale,******* Brand New Original SAMSUNG GALAXY NO...
3,newyork,cell-phones,for-sale,SAMSUNG GALAXY SIII T-999 MARBLE WHITE T-MOBIL...
4,newyork,cell-phones,for-sale,Ipad mini 64gb 4g any sim unlock


In [3]:
# Drop NaN Value
df = df.dropna()

In [4]:
# Train the classification model
def train_model(file_path):
    with open(file_path, 'r') as file:
        data_list = [json.loads(line) for line in file]
    df = pd.json_normalize(data_list)
    
    
    X = df.heading
    y = df['category']

    X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 42, test_size = 0.2)
    X_train.shape, X_test.shape, y_train.shape, y_test.shape

    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(X_train)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    

    # Save Model
    model_folder = '../model'
    os.makedirs(model_folder, exist_ok=True)  # Create the 'model' folder if it doesn't exist

    # Vectorizer model
    vec_file = os.path.join(model_folder, 'vectorizer.pickle')
    pickle.dump(count_vect, open(vec_file, 'wb'))
    

    # Logistic Regression Model
    model = LogisticRegression().fit(X_train_tfidf, y_train)
    mod_file = os.path.join(model_folder, 'predict.model')
    pickle.dump(model, open(mod_file, 'wb'))
    
    #Akurasi Model
    print('Akurasi Model')
    print(round(model.score((X_train_tfidf), y_train),3))
    
train_model('../data/training.json')

Akurasi Model
0.82


In [5]:
# Test Model
def classify_utterance(df):
    loaded_vectorizer = pickle.load(open('../model/vectorizer.pickle', 'rb'))
    loaded_model = pickle.load(open('../model/predict.model', 'rb'))
    print(loaded_model.predict(loaded_vectorizer.transform([df])))


# Load data from JSON file into a DataFrame
def load_data_from_json(file_path):
    with open(file_path, 'r') as file:
        data_list = [json.loads(line) for line in file]
    df = pd.json_normalize(data_list)
    df = df.dropna()
    df = df.head(10)
    return df

In [6]:
# Call the the Function of Test Model
json_file_path = '../data/test-model.json'
df = load_data_from_json(json_file_path)
result = df['heading'].apply(classify_utterance)

['video-games']
['wanted-housing']
['video-games']
['shared']
['therapeutic']
['shared']
['appliances']
['therapeutic']
['therapeutic']
['artists']
