In [29]:
# Run some setup code for this notebook.
import pandas as pd

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
def load_data(path):
    """Read a JSON dataset and flatten its nested ``user`` records.

    Every entry of the ``user`` column is expanded into its own set of
    columns, which are appended to the right of the remaining top-level
    columns. Column names are not de-duplicated, so a key that exists both at
    the top level and inside ``user`` appears twice in the result.

    Args:
        path: Location of the JSON file, handed straight to ``pd.read_json``.

    Returns:
        A DataFrame holding the top-level columns (without ``user``) followed
        by one column per key found in the ``user`` records.
    """
    raw = pd.read_json(path)
    # One Series per record; pandas aligns them into a frame of user fields.
    user_fields = raw["user"].apply(lambda record: pd.Series(record))
    top_level = raw.drop("user", axis=1)
    return pd.concat([top_level, user_fields], axis=1)


# Locations of the labelled training set and of the set whose labels are to be
# filled in.
TRAIN_PATH = "./data/train.json"
TEST_PATH = "./data/test.json"

df = load_data(TRAIN_PATH)
df_to_fill = load_data(TEST_PATH)

# Quick look at the flattened training frame.
df.head()

Unnamed: 0,created_at,label,id,id_str,name,screen_name,location,description,url,entities,...,profile_sidebar_fill_color,profile_text_color,profile_use_background_image,has_extended_profile,default_profile,default_profile_image,following,follow_request_sent,notifications,translator_type
0,2019-01-20 03:36:01+00:00,bot,1318284638,1318284638,Syed Haider Naqvi,HNakvi,Islamabad,,,{'description': {'urls': []}},...,DDEEF6,333333,True,False,True,False,False,False,False,none
1,2019-01-21 21:36:00+00:00,bot,17442457,17442457,Cleveland Clinic MD,CleClinicMD,"Cleveland, Ohio, United States",News and insights for physicians and clinical ...,https://t.co/IxMra2OEey,{'url': {'urls': [{'url': 'https://t.co/IxMra2...,...,D6E9F6,333333,True,False,False,False,False,False,False,none
2,2016-10-11 13:40:17+00:00,bot,102069605,102069605,2NE1 TV,2NE1tv,Seoul,We tweet about 2NE1's video and photo. It’s 2N...,,{'description': {'urls': []}},...,E3E2DE,634047,True,False,False,False,False,False,False,none
3,2019-01-21 18:42:38+00:00,bot,1367759732,1367759732,Alastair de kretser,Alastairkretser,England,"Travel, tech and stuff 😀",,{'description': {'urls': []}},...,DDEEF6,333333,True,True,False,False,False,False,False,none
4,2018-11-13 04:09:04+00:00,bot,2927879268,2927879268,Gran Horóscopo,GranHoroscopo,México - Argentina,Tu mejor prediccion diaria ORIGINAL para todos...,https://t.co/rDVUTyCn9E,{'url': {'urls': [{'url': 'https://t.co/rDVUTy...,...,DDEEF6,333333,True,False,True,False,False,False,False,regular


### 数据预处理

In [31]:
def preprocess(d: pd.DataFrame):
    """Turn a flattened account frame into numeric/categorical feature columns.

    Steps, in order:

    1. Drop identifier, free-text, URL, image and timezone columns.
    2. Rename the first remaining column to ``created_at0``. The frame is
       expected to arrive with two columns both named ``created_at``; the
       rename is positional, so the first one must be the leftmost column.
    3. Parse the remaining ``created_at`` column as datetimes.
    4. Replace every hex colour column by three integer channel columns
       ``<name>_r``, ``<name>_g`` and ``<name>_b``, appended at the right end
       of the frame in the order the colour columns are listed below.
    5. Lower-case the ``lang`` column (missing values are left as they are).

    The input frame is not modified; all work happens on new objects.

    Args:
        d: Frame containing every column named in this function.

    Returns:
        A new DataFrame with the transformations above applied.

    Raises:
        KeyError: If one of the expected columns is absent.
        ValueError: If a colour value is not a valid hexadecimal string.
        TypeError: If a colour value is not a string (for example missing).
    """
    dropped_columns = [
        "id",
        "id_str",
        "utc_offset",
        "time_zone",
        "name",
        "screen_name",
        "location",
        "description",
        "url",
        "entities",
        "profile_background_image_url",
        "profile_background_image_url_https",
        "profile_image_url",
        "profile_image_url_https",
        "profile_banner_url",
    ]
    color_columns = [
        "profile_background_color",
        "profile_link_color",
        "profile_sidebar_border_color",
        "profile_sidebar_fill_color",
        "profile_text_color",
    ]

    # drop() without inplace returns a new frame, so the caller's object is
    # never touched by anything below.
    d = d.drop(columns=dropped_columns)

    # Positional rename: disambiguates the leftmost column from the other
    # column that carries the same "created_at" name.
    d.columns = ["created_at0"] + list(d.columns[1:])
    # No infer_datetime_format: it is deprecated in pandas 2, where format
    # inference is already the default behaviour.
    d["created_at"] = pd.to_datetime(d["created_at"])

    def hex_to_int(s: str) -> int:
        return int(s, base=16)

    # Decode each colour once to a packed integer and split the channels with
    # vectorised arithmetic instead of building one Series per row.
    rgb_frames = []
    for name in color_columns:
        packed = d[name].map(hex_to_int)
        rgb_frames.append(
            pd.DataFrame(
                {
                    name + "_r": packed // (256 * 256),
                    name + "_g": (packed // 256) % 256,
                    name + "_b": packed % 256,
                }
            )
        )
    # Single concat: the hex columns disappear and the channel columns are
    # appended in the same order as one drop/concat per colour would give.
    d = pd.concat([d.drop(columns=color_columns)] + rgb_frames, axis=1)

    d["lang"] = d["lang"].str.lower()

    return d

# get_dummies orders its indicator columns by label value, so column 0 is the
# indicator of the first label in sorted order (presumably "bot" -- confirm
# against the full set of labels in train.json).
label_indicators = pd.get_dummies(df["label"])
df_label = label_indicators.iloc[:, 0]

# Remove the label from both feature frames, in place.
df.drop(columns="label", inplace=True)
df_to_fill.drop(columns="label", inplace=True)

def dummy2(a: pd.DataFrame, b: pd.DataFrame):
    """One-hot encode two frames against a shared set of dummy columns.

    The frames are stacked row-wise and encoded by a single
    ``pd.get_dummies`` call, so both halves end up with identical columns
    even when a category occurs in only one of them. The encoded frame is
    then cut back into two pieces at the original boundary.

    Args:
        a: First frame; its rows come first in the stacked frame.
        b: Second frame.

    Returns:
        A tuple ``(encoded_a, encoded_b)`` where ``encoded_a`` holds the first
        ``len(a)`` encoded rows and ``encoded_b`` the remaining ones.
    """
    split_at = len(a)
    stacked = pd.concat([a, b])
    encoded = pd.get_dummies(stacked)
    encoded_a = encoded[:split_at]
    encoded_b = encoded[split_at:]
    return encoded_a, encoded_b

# Clean both frames, then one-hot encode them together so the training and
# to-be-filled frames share exactly the same columns.
df, df_to_fill = dummy2(preprocess(df), preprocess(df_to_fill))

In [32]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line after each report.
df.info()
df_to_fill.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1986 entries, 0 to 1985
Data columns (total 56 columns):
 #   Column                          Non-Null Count  Dtype              
---  ------                          --------------  -----              
 0   created_at0                     1986 non-null   datetime64[ns, UTC]
 1   protected                       1986 non-null   bool               
 2   followers_count                 1986 non-null   int64              
 3   friends_count                   1986 non-null   int64              
 4   listed_count                    1986 non-null   int64              
 5   created_at                      1986 non-null   datetime64[ns, UTC]
 6   favourites_count                1986 non-null   int64              
 7   geo_enabled                     1986 non-null   bool               
 8   verified                        1986 non-null   bool               
 9   statuses_count                  1986 non-null   int64              
 10  contributors

In [33]:
# Preview the first rows of the training frame after preprocessing and
# one-hot encoding.
df.head()

Unnamed: 0,created_at0,protected,followers_count,friends_count,listed_count,created_at,favourites_count,geo_enabled,verified,statuses_count,...,lang_ko,lang_nl,lang_pl,lang_pt,lang_ru,lang_th,lang_tr,translator_type_badged,translator_type_none,translator_type_regular
0,2019-01-20 03:36:01+00:00,False,28533,1164,45,2013-03-31 12:21:09+00:00,6006,True,False,7446,...,0,0,0,0,0,0,0,0,1,0
1,2019-01-21 21:36:00+00:00,False,299192,1721,1517,2008-11-17 14:46:49+00:00,401,True,True,25344,...,0,0,0,0,0,0,0,0,1,0
2,2016-10-11 13:40:17+00:00,False,89051,743,307,2010-01-05 14:06:26+00:00,154,False,False,47479,...,1,0,0,0,0,0,0,0,1,0
3,2019-01-21 18:42:38+00:00,False,43211,237,29,2013-04-20 18:55:44+00:00,358,False,False,16586,...,0,0,0,0,0,0,0,0,1,0
4,2018-11-13 04:09:04+00:00,False,113300,16422,112,2014-12-13 00:20:53+00:00,37,True,False,70666,...,0,0,0,0,0,0,0,0,0,1


## 模型训练

In [36]:
from sklearn.model_selection import train_test_split

# Fixed seed: without random_state the split is reshuffled on every run, so
# the rows shown below and every downstream score would change after a
# kernel restart.
X_train, X_test, y_train, y_test = train_test_split(df, df_label, random_state=42)

X_train.head()

Unnamed: 0,created_at0,protected,followers_count,friends_count,listed_count,created_at,favourites_count,geo_enabled,verified,statuses_count,...,lang_ko,lang_nl,lang_pl,lang_pt,lang_ru,lang_th,lang_tr,translator_type_badged,translator_type_none,translator_type_regular
159,2018-08-06 10:21:00+00:00,False,143524,124696,22,2012-04-25 02:58:51+00:00,0,True,False,692814,...,0,0,0,0,0,0,0,0,0,1
1833,2019-01-21 22:48:19+00:00,False,76242949,103,88901,2010-06-14 19:09:20+00:00,0,True,True,3344,...,0,0,0,0,0,0,0,0,0,1
1229,2016-08-04 15:22:12+00:00,False,27100,0,9,2015-07-28 06:10:26+00:00,49,False,False,6276,...,0,0,0,0,0,0,0,0,1,0
885,2019-01-21 13:57:10+00:00,False,1122,646,39,2010-05-23 15:25:10+00:00,7671,True,False,71620,...,0,0,0,0,0,0,0,0,1,0
1074,2018-12-20 14:11:14+00:00,False,3,11,0,2018-09-11 12:32:40+00:00,3,False,False,7,...,0,0,0,0,0,0,0,0,1,0


### 线性模型

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler


def datetimes_to_epoch_seconds(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` whose datetime columns are seconds since 1970-01-01.

    scikit-learn estimators need purely numeric input, and the feature frame
    still carries datetime columns (``created_at0`` and ``created_at``).
    Both timezone-aware and naive datetime columns are converted; all other
    columns are passed through unchanged.
    """
    X = X.copy()
    for col in X.select_dtypes(include=["datetime", "datetimetz"]).columns:
        # Epoch expressed in the column's own timezone (or naive) so the
        # subtraction is valid for either kind of column.
        epoch = pd.Timestamp(0, tz=X[col].dt.tz)
        X[col] = (X[col] - epoch).dt.total_seconds()
    return X


def make_numeric_pipeline(estimator):
    """Wrap ``estimator`` so it can be fitted on the raw feature frame.

    The pipeline converts datetimes to numbers and standardises every
    feature; the raw columns span very different magnitudes (small flags
    next to follower counts in the tens of millions), which hampers the
    logistic-regression solver.
    """
    return make_pipeline(
        FunctionTransformer(datetimes_to_epoch_seconds, validate=False),
        StandardScaler(),
        estimator,
    )


# Both models are Pipelines: fit/predict/score accept the same frames as
# X_train / X_test, and the fitted estimator itself is `model.steps[-1][1]`.
model_linear = make_numeric_pipeline(LinearRegression())
model_linear.fit(X_train, y_train)

# max_iter raised from the default so the solver has room to converge.
model_logistic = make_numeric_pipeline(LogisticRegression(max_iter=1000))
model_logistic.fit(X_train, y_train)