In [1]:
import pandas as pd
from lightwood.api.high_level import (
    ProblemDefinition,
    json_ai_from_problem,
    code_from_json_ai,
    predictor_from_code,
)

# Load the labelled training set and discard every row that has a missing
# value. The non-mutating, chained dropna() replaces the former
# `df.dropna(inplace=True)`; the resulting frame is the same.
df = pd.read_csv("./pulsar_data_train.csv.zip").dropna()

# Separate test file that is scored by a later cell.
# NOTE(review): unlike the training frame, no dropna() is applied here.
realtestdata_df = pd.read_csv("./pulsar_data_test.csv")

# Define the prediction task by naming the target column.
pdef = ProblemDefinition.from_dict(
    {
        "target": "target_class",  # column you want to predict
        # NOTE(review): presumably asks Lightwood to weight classes inversely
        # to their frequency -- confirm against the installed version's docs.
        "unbias_target": True,
    }
)

# Generate the JSON-AI specification that models the problem.
json_ai = json_ai_from_problem(df, problem_definition=pdef)

# OPTIONAL - see the JSON-AI syntax
print(json_ai.to_json())



[32mINFO:lightwood-672:Dropping features: [][0m
[32mINFO:lightwood-672:Analyzing a sample of 6305[0m
[32mINFO:lightwood-672:from a total population of 9273, this is equivalent to 68.0% of your data.[0m
[32mINFO:lightwood-672:Infering type for:  Mean of the integrated profile[0m
[32mINFO:lightwood-672:Column  Mean of the integrated profile has data type float[0m
[32mINFO:lightwood-672:Infering type for:  Standard deviation of the integrated profile[0m
[32mINFO:lightwood-672:Column  Standard deviation of the integrated profile has data type float[0m
[32mINFO:lightwood-672:Infering type for:  Excess kurtosis of the integrated profile[0m
[32mINFO:lightwood-672:Column  Excess kurtosis of the integrated profile has data type float[0m
[32mINFO:lightwood-672:Infering type for:  Skewness of the integrated profile[0m
[32mINFO:lightwood-672:Column  Skewness of the integrated profile has data type float[0m
[32mINFO:lightwood-672:Infering type for:  Mean of the DM-SNR curve[

{
    "features": {
        " Mean of the integrated profile": {
            "encoder": {
                "module": "NumericEncoder",
                "args": {}
            },
            "data_dtype": "float"
        },
        " Standard deviation of the integrated profile": {
            "encoder": {
                "module": "NumericEncoder",
                "args": {}
            },
            "data_dtype": "float"
        },
        " Excess kurtosis of the integrated profile": {
            "encoder": {
                "module": "NumericEncoder",
                "args": {}
            },
            "data_dtype": "float"
        },
        " Skewness of the integrated profile": {
            "encoder": {
                "module": "NumericEncoder",
                "args": {}
            },
            "data_dtype": "float"
        },
        " Mean of the DM-SNR curve": {
            "encoder": {
                "module": "NumericEncoder",
                "args": {}
            

In [2]:
# Generate the Python source for a predictor from the JSON-AI spec (`json_ai`).
# Running this cell logs that the optional `black` formatter could not be
# imported, so the generated source may be left unformatted.
code = code_from_json_ai(json_ai)

# OPTIONAL - uncomment the next line to inspect the generated source
#print(code)

# Create a predictor object from the generated source; it is trained in the
# next cell via predictor.learn().
predictor = predictor_from_code(code)

[32mINFO:lightwood-672:Unable to import black formatter, predictor code might be a bit ugly.[0m


In [3]:
# Fit the predictor end-to-end on the cleaned training frame.
predictor.learn(df)

# Run the predictor's own preprocessing and splitting on the same frame and
# keep the "test" partition. Nothing is predicted or displayed in this cell.
cleaned_df = predictor.preprocess(df)
data_splits = predictor.split(cleaned_df)
test_df = data_splits["test"]

[32mINFO:lightwood-672:Dropping features: [][0m
[32mINFO:lightwood-672:Performing statistical analysis on data[0m
[32mINFO:lightwood-672:Starting statistical analysis[0m
[32mINFO:lightwood-672:Finished statistical analysis[0m
[32mINFO:lightwood-672:Cleaning the data[0m
[32mINFO:lightwood-672:Splitting the data into train/test[0m
[32mINFO:lightwood-672:Preparing the encoders[0m
[32mINFO:lightwood-672:Encoder prepping dict length of: 1[0m
[32mINFO:lightwood-672:Encoder prepping dict length of: 2[0m
[32mINFO:lightwood-672:Encoder prepping dict length of: 3[0m
[32mINFO:lightwood-672:Encoder prepping dict length of: 4[0m
[32mINFO:lightwood-672:Encoder prepping dict length of: 5[0m
[32mINFO:lightwood-672:Encoder prepping dict length of: 6[0m
[32mINFO:lightwood-672:Encoder prepping dict length of: 7[0m
[32mINFO:lightwood-672:Encoder prepping dict length of: 8[0m
[32mINFO:lightwood-672:Encoder prepping dict length of: 9[0m
[32mINFO:lightwood-672:Done running fo

In [5]:
# Score the separate test file (loaded in the first cell) with the predictor.
# NOTE(review): that frame never went through dropna(), unlike the training
# data -- confirm missing values are handled as intended.
preds = predictor.predict(realtestdata_df)

# Plain-text dump of the result. In the recorded output the frame has the
# columns original_index, prediction, __mdb_proba_0.0, __mdb_proba_1.0 and
# confidence.
# NOTE(review): the recorded __mdb_proba_* values fall outside [0, 1]
# (e.g. -2.599891 / 3.599891), so do not read them as probabilities without
# checking how they are produced.
print(preds)

[32mINFO:lightwood-3164:Dropping features: [][0m
[32mINFO:lightwood-3164:Cleaning the data[0m
[32mINFO:lightwood-3164:Featurizing the data[0m
[32mINFO:lightwood-3164:The block ICP is now running its explain() method[0m
[32mINFO:lightwood-3164:The block AccStats is now running its explain() method[0m
[32mINFO:lightwood-3164:AccStats.explain() has not been implemented, no modifications will be done to the data insights.[0m


      original_index prediction  __mdb_proba_0.0  __mdb_proba_1.0  confidence
0                  0        0.0        -2.599891         3.599891    0.553191
1                  1        1.0       -14.581289        15.581289    0.968431
2                  2        0.0        -1.393124         2.393124    0.319149
3                  3        1.0         1.371195        -0.371195    0.148936
4                  4        0.0        -0.763157         1.763157    0.202128
...              ...        ...              ...              ...         ...
5365            5365        0.0        -4.569973         5.569973    0.851064
5366            5366        0.0        -1.754324         2.754324    0.393617
5367            5367        0.0        -0.871799         1.871799    0.212766
5368            5368        0.0        -2.236572         3.236572    0.489362
5369            5369        0.0        -0.990976         1.990976    0.234043

[5370 rows x 5 columns]


In [4]:
# Save the predictor to ./pulsars_model for later use.
# Per its execution count (In [4]) this cell ran before the prediction cell
# (In [5]), even though it appears after it in the notebook.

predictor.save("./pulsars_model")

In [6]:
# Export the predictions to CSV.
# index=False: the frame already carries an explicit `original_index` column
# (identical to the default row index in the recorded output), so also writing
# the index would only add a redundant, unnamed first column to the file.
# NOTE(review): the file name's spelling ("predictations") is kept unchanged
# so anything that already reads this path keeps working.
preds.to_csv("test_predictations.csv", index=False)