In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
def method(x):
    s = ""
    l = []
    for i in x:
        i = i.split(" ")
        l.append("".join(i))
    return " ".join(l)

# Step 1: Data Preprocessing
df = pd.read_csv("cleaned_train.csv")
df['KEYWORDS'] = df['KEYWORDS'].apply(lambda x: eval(x))
df["KEYWORDS"] = df["KEYWORDS"].apply(method)

# Step 2: Feature Engineering
data = pd.DataFrame(df[["DEVICETYPE", "PLATFORMTYPE", "URL", "KEYWORDS", "IS_HCP"]])
counter = dict(data.URL.value_counts())
data["URL"] = data["URL"].apply(lambda x: "others" if counter[x] < 5 else x)

# Step 3: Splitting the Data
train_ratio = 0.8
X_train, X_test, y_train, y_test = train_test_split(
    data.drop("IS_HCP", axis=1), data["IS_HCP"], train_size=train_ratio, stratify=data["IS_HCP"]
)

# Step 4: Column Transformation
# Create a column transformer to apply different transformations to different columns
column_transformer = ColumnTransformer(
    [
        ("onehot", OneHotEncoder(handle_unknown='ignore'), ["DEVICETYPE", "PLATFORMTYPE", "URL"],),
        ("text_vectorizer", CountVectorizer(), "KEYWORDS")  # Add the columns to be scaled
    ],
    remainder="passthrough"
)

X_train_transformed = column_transformer.fit_transform(X_train)
X_test_transformed = column_transformer.transform(X_test)

# Step 5: Model Training and Step 6: Model Evaluation
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "AdaBoost": AdaBoostClassifier()
}
for name, model in models.items():
    model.fit(X_train_transformed, y_train)
    y_pred = model.predict(X_test_transformed)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name}: {acc}")
# model = RandomForestClassifier()
# model.fit(X_train_transformed, y_train)
# y_pred = model.predict(X_test_transformed)
# acc = accuracy_score(y_test, y_pred)
# print(acc)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression: 0.9266300311896628
Random Forest: 0.9694044259616813
SVM: 0.9472746175553245
AdaBoost: 0.8280112876875093


In [3]:
# Step 7: Load and Preprocess Test Data
test_df = pd.read_csv("cleaned_test.csv")
test_df['KEYWORDS'] = test_df['KEYWORDS'].apply(lambda x: eval(x))
test_df["KEYWORDS"] = test_df["KEYWORDS"].apply(method)
test_data = pd.DataFrame(test_df[["DEVICETYPE", "PLATFORMTYPE", "URL", "KEYWORDS"]])
test_data["URL"] = test_data["URL"].apply(lambda x: "others" if counter.get(x, 0) < 5 else x)

# Step 8: Apply Column Transformation to Test Data
test_transformed = column_transformer.transform(test_data)

# Step 9: Make Predictions on Test Data
y_pred_test = model.predict(test_transformed)

In [4]:
test_df["IS_HCP"] = y_pred_test

In [5]:
test_df["IS_HCP"].value_counts()

IS_HCP
0.0    17015
1.0    11478
Name: count, dtype: int64

In [6]:
df["IS_HCP"].value_counts()

IS_HCP
0.0    18715
1.0    14949
Name: count, dtype: int64

In [7]:
test_df

Unnamed: 0,ID,DEVICETYPE,PLATFORM_ID,BIDREQUESTIP,USERPLATFORMUID,USERCITY,USERZIPCODE,USERAGENT,PLATFORMTYPE,CHANNELTYPE,URL,KEYWORDS,IS_HCP
0,115501,Desktop,2,75.189.231.103,0d5041ff-f0b6-4d1a-9ad7-0a29f7d485b4,Fayetteville,28305.0,Macintosh,Online Medical Journal,Website,www.clinicaladvisor.com,familypractice drainage clinical dermatology o...,0.0
1,115502,Mobile,2,24.101.33.158,c8396dd0-969f-4d99-a40b-b7bb1f516154,Conneaut Lake,16316.0,iPhone,Online Medical Journal,Website,www.ophthalmologyadvisor.com,general clinical operative medicine cardiology...,1.0
2,115503,Desktop,2,172.118.216.142,3c97a081-6518-43f8-9f26-369759cfb471,Covina,91724.0,Macintosh,Online Medical Journal,Website,www.psychiatryadvisor.com,abortion anxietydisorders apnea false trauma m...,0.0
3,115504,Desktop,7,71.105.120.171,3e2578c8-f794-41af-a38c-c5cfb3c0f014,Brooklyn,11226.0,Macintosh,Online Medical Journal,Website,www.cureus.com,health male neurologicalsurgery otolaryngology...,0.0
4,115505,Desktop,2,73.82.211.73,ec2ae7ce-6a8c-4156-98a7-07203e60f483,Marietta,30062.0,Windows,Online Medical Journal,Website,www.renalandurologynews.com,chronickidneydisease pain nephrology disease h...,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
28488,143989,Desktop,2,69.202.233.241,78ce4bbe-3885-4c14-b945-a0ea1e4574f4,Brooklyn,11215.0,Windows,Online Medical Journal,Website,www.renalandurologynews.com,transplantation psychiatry heartfailure angiot...,1.0
28489,143990,Desktop,7,75.4.190.65,a53799e1-b279-40cf-b467-11b4d659e325,Miami,33178.0,Windows,Online Medical Journal,Website,www.cureus.com,small hospitals nephrology emergencymedicine r...,0.0
28490,143991,Desktop,7,137.52.180.45,acc732a8-2ee9-4e2c-9933-70e34db48101,Fort Lauderdale,33314.0,Macintosh,Online Medical Journal,Website,www.cureus.com,small hospitals nephrology emergencymedicine r...,0.0
28491,143992,Desktop,8,66.249.66.4,dce21294-b105-4abb-b145-4e62d71def44,New York,63169.0,Linux,Online Medical Journal,Website,radrounds.com,the disease pain radiology lungabscess dyspare...,1.0


In [12]:
new = test_df[["ID", "IS_HCP"]]
# convert hcp column to int
new["IS_HCP"] = new["IS_HCP"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new["IS_HCP"] = new["IS_HCP"].astype(int)


In [13]:
new.to_csv("submission.csv", index=False)