In [1]:
import pandas as pd
import logging
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error

#New text processing techniques that I applied
from sklearn.feature_extraction.text import TfidfVectorizer

#Decision tree regressor
from sklearn.tree import DecisionTreeRegressor 

In [2]:
def main(train_path='train.json', test_path='test.json',
         output_path='predicted.json', random_state=123):
    """Fit a baseline and a decision tree on TF-IDF text features and predict `year`.

    Loads the training and test records, holds out a validation split stratified
    on `year`, logs the validation mean absolute error of a mean-predicting
    baseline and of a decision tree regressor, then writes the test records with
    the tree's predicted `year` to a JSON file.

    Args:
        train_path: JSON file of training records; must include a `year` field
            alongside the text fields used as features.
        test_path: JSON file of records to predict for.
        output_path: Destination of the prediction file.
        random_state: Seed used for both the validation split and the tree.

    Returns:
        None. Results are reported through `logging` and the prediction file.
    """
    # Text columns that are vectorized; every other column is dropped.
    feature_names = ["title", "abstract", "publisher"]

    def load_frame(path):
        # The context manager closes the handle (the bare open() leaked it);
        # JSON is UTF-8 by specification, so do not rely on the locale default.
        with open(path, encoding='utf-8') as handle:
            return pd.DataFrame.from_records(json.load(handle)).fillna("")

    def build_featurizer():
        # One independent TF-IDF vocabulary per text column.
        return ColumnTransformer(
            transformers=[(name, TfidfVectorizer(), name) for name in feature_names],
            remainder='drop')

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Loading training/test data")
    train = load_frame(train_path)
    test = load_frame(test_path)

    logging.info("Splitting validation")
    train, val = train_test_split(train, stratify=train['year'], random_state=random_state)
    # Split features/target once instead of re-dropping the column on every call.
    X_train, y_train = train.drop('year', axis=1), train['year'].values
    X_val, y_val = val.drop('year', axis=1), val['year'].values

    # Each pipeline gets its own featurizer: a Pipeline fits its steps in place,
    # so a shared instance would be refitted by whichever model is trained last.
    dummy = make_pipeline(build_featurizer(), DummyRegressor(strategy='mean'))
    # The tree permutes candidate features at every split, so without a seed the
    # reported MAE changes from run to run.
    # (A Ridge pipeline was tried earlier; the tree scored a lower validation MAE.)
    tree = make_pipeline(build_featurizer(),
                         DecisionTreeRegressor(random_state=random_state))

    logging.info("Fitting models")
    dummy.fit(X_train, y_train)
    tree.fit(X_train, y_train)

    logging.info("Evaluating on validation data")
    err = mean_absolute_error(y_val, dummy.predict(X_val))
    logging.info(f"Mean baseline MAE: {err}")
    err = mean_absolute_error(y_val, tree.predict(X_val))
    logging.info(f"Tree regress MAE: {err}")

    logging.info("Predicting on test")
    test['year'] = tree.predict(test)
    logging.info("Writing prediction file")
    test.to_json(output_path, orient='records', indent=2)
    
# Run the pipeline end to end: results are logged and predictions go to predicted.json.
main()

INFO:root:Loading training/test data
INFO:root:Splitting validation
INFO:root:Fitting models
INFO:root:Evaluating on validation data
INFO:root:Mean baseline MAE: 7.8054390754858805
INFO:root:Tree regress MAE: 4.0916637981821475
INFO:root:Predicting on test
INFO:root:Writing prediction file


In [None]:

"""Predictions with CountVectorizer"""

#Prediction with feature= "title"
#INFO:root:Mean baseline MAE: 7.8054390754858805
#INFO:root:Ridge regress MAE: 5.8123259857253915

#Prediction with feature = "abstract"
#INFO:root:Mean baseline MAE: 7.8054390754858805
#INFO:root:Ridge regress MAE: 6.371295315649925

#Prediction with feature = "publisher"
#INFO:root:Mean baseline MAE: 7.8054390754858805
#INFO:root:Ridge regress MAE: 5.443128700037363

"""Predictions with TfidfVectorizer"""

#Prediction with feature= "title"
#INFO:root:Mean baseline MAE: 7.8054390754858805
#INFO:root:Ridge regress MAE: 5.387367490324335

#Prediction with feature = "publisher"
#INFO:root:Mean baseline MAE: 7.8054390754858805
#INFO:root:Ridge regress MAE: 5.444409508095158

#Prediction with features = "abstract", "title" and "publisher", and a decision tree
#INFO:root:Mean baseline MAE: 7.8054390754858805
#INFO:root:decision tree MAE: 4.031015478965557 !!!!!!!!!!!!!!!!

"""Decision tree and TfidfVectorizer"""
#Prediction with features = "abstract", "title" and "publisher"
# INFO:root:Mean baseline MAE: 7.8054390754858805
# INFO:root:Ridge regress MAE: 4.097044383573657




In [None]:
"""Best code so far:
#Prediction with features = "abstract", "title" and "publisher", decision tree and TfidfVectorizer()
#INFO:root:Mean baseline MAE: 7.8054390754858805
#INFO:root:decision tree MAE: 4.031015478965557"""


def main(train_path='train.json', test_path='test.json',
         output_path='predicted.json', random_state=123):
    """Fit a baseline and a decision tree on TF-IDF text features and predict `year`.

    Loads the training and test records, holds out a validation split stratified
    on `year`, logs the validation mean absolute error of a mean-predicting
    baseline and of a decision tree regressor, then writes the test records with
    the tree's predicted `year` to a JSON file.

    Args:
        train_path: JSON file of training records; must include a `year` field
            alongside the text fields used as features.
        test_path: JSON file of records to predict for.
        output_path: Destination of the prediction file.
        random_state: Seed used for both the validation split and the tree.

    Returns:
        None. Results are reported through `logging` and the prediction file.
    """
    # Text columns that are vectorized; every other column is dropped.
    feature_names = ["title", "abstract", "publisher"]

    def load_frame(path):
        # The context manager closes the handle (the bare open() leaked it);
        # JSON is UTF-8 by specification, so do not rely on the locale default.
        with open(path, encoding='utf-8') as handle:
            return pd.DataFrame.from_records(json.load(handle)).fillna("")

    def build_featurizer():
        # One independent TF-IDF vocabulary per text column.
        return ColumnTransformer(
            transformers=[(name, TfidfVectorizer(), name) for name in feature_names],
            remainder='drop')

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Loading training/test data")
    train = load_frame(train_path)
    test = load_frame(test_path)

    logging.info("Splitting validation")
    train, val = train_test_split(train, stratify=train['year'], random_state=random_state)
    # Split features/target once instead of re-dropping the column on every call.
    X_train, y_train = train.drop('year', axis=1), train['year'].values
    X_val, y_val = val.drop('year', axis=1), val['year'].values

    # Each pipeline gets its own featurizer: a Pipeline fits its steps in place,
    # so a shared instance would be refitted by whichever model is trained last.
    dummy = make_pipeline(build_featurizer(), DummyRegressor(strategy='mean'))
    # The tree permutes candidate features at every split, so without a seed the
    # reported MAE changes from run to run.
    # (A Ridge pipeline was tried earlier; the tree scored a lower validation MAE.)
    tree = make_pipeline(build_featurizer(),
                         DecisionTreeRegressor(random_state=random_state))

    logging.info("Fitting models")
    dummy.fit(X_train, y_train)
    tree.fit(X_train, y_train)

    logging.info("Evaluating on validation data")
    err = mean_absolute_error(y_val, dummy.predict(X_val))
    logging.info(f"Mean baseline MAE: {err}")
    err = mean_absolute_error(y_val, tree.predict(X_val))
    logging.info(f"Tree regress MAE: {err}")

    logging.info("Predicting on test")
    test['year'] = tree.predict(test)
    logging.info("Writing prediction file")
    test.to_json(output_path, orient='records', indent=2)
    
# Run the pipeline using the main() defined in this cell.
main()