In [1]:
input_folder = "llm-detect-ai-generated-text"

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk(input_folder):
    for filename in filenames:
        print(os.path.join(dirname, filename))

llm-detect-ai-generated-text/train_essays.csv
llm-detect-ai-generated-text/train_prompts.csv
llm-detect-ai-generated-text/test_essays.csv
llm-detect-ai-generated-text/sample_submission.csv


Load data

In [4]:
train_data = pd.read_csv(f"{input_folder}/train_essays.csv")
train_data.tail()

Unnamed: 0,id,prompt_id,text,generated
1373,fe6ff9a5,1,There has been a fuss about the Elector Colleg...,0
1374,ff669174,0,Limiting car usage has many advantages. Such a...,0
1375,ffa247e0,0,There's a new trend that has been developing f...,0
1376,ffc237e9,0,As we all know cars are a big part of our soci...,0
1377,ffe1ca0d,0,Cars have been around since the 1800's and hav...,0


In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1378 entries, 0 to 1377
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         1378 non-null   object
 1   prompt_id  1378 non-null   int64 
 2   text       1378 non-null   object
 3   generated  1378 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 43.2+ KB


In [31]:
len(train_data.iloc[0, 2])

3289

We got the average character in the essay, it's around 3200

In [30]:
sum_length = 0
count = 0
for i in range(train_data.shape[0]):
    sum_length = sum_length + len(train_data.iloc[i, 2])
    count = count + 1
print(sum_length/count)

3169.0507982583454


There is only 3 generated from LLM

In [5]:
train_data.generated.sum()

3

We got something about features: 
1. Lots of features: 79
2. Features with lots of NaN, should drop: Alley, MasVnrType, FireplaceQu, PoolQC, Fence, MiscFeature
3. Nonsense features should drop: Id


In [30]:
features = list(test_data.columns)
len(features)

80

In [31]:
drop_features = ["Alley", "MasVnrType", "FireplaceQu", "PoolQC", "Fence", "MiscFeature"]

In [32]:
used_features = [i for i in features if i not in drop_features]
len(used_features)

74

In [33]:
test_data.shape

(1459, 80)

In [34]:
test_data = test_data[used_features]
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 74 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   LotShape       1459 non-null   object 
 7   LandContour    1459 non-null   object 
 8   Utilities      1457 non-null   object 
 9   LotConfig      1459 non-null   object 
 10  LandSlope      1459 non-null   object 
 11  Neighborhood   1459 non-null   object 
 12  Condition1     1459 non-null   object 
 13  Condition2     1459 non-null   object 
 14  BldgType       1459 non-null   object 
 15  HouseStyle     1459 non-null   object 
 16  OverallQual    1459 non-null   int64  
 17  OverallCond    1459 non-null   int64  
 18  YearBuil

## Ensemble model: DecisionTreeClassifier

We need to dummies object features: Sex, Cabin, Embarked

In [35]:
object_f = list(test_data.select_dtypes(include="object").columns)
len(object_f)

37

In [36]:
non_object_f = [i for i in used_features if i not in object_f]
len(non_object_f)

37

In [37]:
def preprocess_data(data, object_f, non_object_f):
    object_data = pd.get_dummies(data[object_f])
    non_object_data = data[non_object_f]
    return pd.concat([object_data, non_object_data], axis=1)
    

In [38]:
from sklearn.tree import DecisionTreeRegressor

y = train_data["SalePrice"]

X = preprocess_data(train_data, object_f, non_object_f)
X_test = preprocess_data(test_data, object_f, non_object_f)

In [39]:
model_features = [i for i in X.columns if i in X_test.columns]
X = X[model_features]
X_test = X_test[model_features]

In [40]:
X.columns

Index(['MSZoning_C (all)', 'MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL',
       'MSZoning_RM', 'Street_Grvl', 'Street_Pave', 'LotShape_IR1',
       'LotShape_IR2', 'LotShape_IR3',
       ...
       'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold'],
      dtype='object', length=251)

In [41]:
clf = DecisionTreeRegressor(random_state=0)
clf.fit(X, y)

In [44]:
predictions = clf.predict(X_test)

output = pd.DataFrame({'Id': test_data.Id, 'SalePrice': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
