*Eduardus Tjitrahardja | @edutjie | 2022*

# Ristek Datathon 2022: Extract

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np

import nltk
from tqdm import tqdm, notebook
from gensim.models.fasttext import load_facebook_model

from keras.applications.vgg19 import preprocess_input as vgg19_preprocess_input, VGG19
from keras.applications.densenet import preprocess_input as densenet_preprocess_input, DenseNet121
import keras.utils as image
from keras import layers
from keras.models import Model
import keras.backend as K

In [3]:
combined_df = pd.read_csv('dataset/processed_combined.csv')
combined_df.head()

Unnamed: 0,created_at,id,user_id,user_name,url,text,media,label
0,2020-02-23 04:06:39+00:00,1231430140824973313,133931409,r0b1 sur1a (黄玉春),https://twitter.com/R0b1Sur1a/status/123143014...,akhir juga terobos banjir,ERbrKgFU4AAnADb.jpg,0.0
1,2020-01-05 01:46:46+00:00,1213637932411580417,253063316,Beradaptasi di Era Pandemi ☀️,https://twitter.com/MarikaRahman_/status/12136...,kemekes ri idi banten ibi banten ppni banten t...,ENe1HUVUEAANLFT.jpg,0.0
2,2020-01-18 06:22:09+00:00,1218418277946396673,64318803,rywyu,https://twitter.com/rywyu/status/1218418277946...,cikini rada banjir tdi pagi hingga gk tau apa ...,EOiw80RU0AcJ4CI.jpg,0.0
3,2020-02-22 23:38:00+00:00,1231362534717837313,17383917,ICALIZERS,https://twitter.com/icalizers/status/123136253...,sperma tt kala warga jakarta sedang prihatin b...,ERatrt9UYAAtLMI.jpg,0.0
4,2019-12-17 10:54:31+00:00,1206890412574568449,3102973556,AN,https://twitter.com/lokbin103/status/120689041...,kuis jakarta banjir parah hari tanya siapa yg ...,EL-8Z-vUcAIiIlV.jpg,0.0


In [4]:
train_df = combined_df[combined_df['label'].notnull()]
test_df = combined_df[combined_df['label'].isnull() & combined_df['media'].notnull()]
unlabeled_df = combined_df[combined_df['label'].isnull() & combined_df['media'].isnull()]

train_df.shape, test_df.shape, unlabeled_df.shape, combined_df.shape

((1518, 8), (1011, 8), (9936, 8), (12465, 8))

## Image Feature

In [2]:
# extract images features from media column using vgg19
def extract_features(df, model, preprocess_input):
    img_size = 256
    batch_size = 16
    n_batches = df.shape[0] // batch_size + 1
    features = {}
    for b in notebook.tqdm(range(n_batches)):
        start = b * batch_size
        end = (b + 1) * batch_size
        batch = df[start:end]
        images = []
        for image_path in batch["media"]:
            img = image.load_img(f"dataset/media/Image/{image_path}", target_size=(img_size, img_size))
            x = image.img_to_array(img)
            x = np.expand_dims(x, axis=0)
            x = preprocess_input(x)
            images.append(x)
        images = np.vstack(images)
        batch_features = model.predict(images)
        for i, index in enumerate(batch.index):
            features[index] = batch_features[i]
    return pd.DataFrame.from_dict(features, orient="index")

### VGG19

In [41]:
inp = layers.Input((256,256,3))
backbone = VGG19(weights='imagenet', include_top=False, input_tensor=inp)
x = backbone.output
x = layers.GlobalAveragePooling2D()(x)
x = layers.Lambda(lambda x: K.expand_dims(x,axis = -1))(x)
x = layers.AveragePooling1D(4)(x)
out = layers.Lambda(lambda x: x[:,:,0])(x)

vgg19_model = Model(inp,out)

In [47]:
train_df.media.nunique()

1517

In [50]:
features = extract_features(train_df, vgg19_model)
features.head()

  0%|          | 0/95 [00:00<?, ?it/s]



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,1.352288,1.014886,4.034522,1.031563,1.12494,3.010315,1.683836,3.008744,2.046914,0.221159,...,3.672439,0.474019,5.825624,3.236522,1.618736,4.117425,1.665372,1.647556,6.883621,2.747848
1,0.221165,1.658107,2.771854,0.703428,0.42601,1.664204,0.666496,3.533314,1.088824,0.427827,...,2.604076,0.569781,6.766608,8.160769,1.653374,0.417267,1.075007,1.685622,1.218619,9.190394
2,0.594882,0.389002,0.036172,0.690939,0.326817,0.175458,0.505132,0.500877,2.187993,0.631999,...,6.489656,0.766638,0.17208,0.646592,0.048913,1.525172,0.504017,0.04437,1.049962,1.501683
3,0.0,0.895361,0.0,0.651359,1.416523,0.107897,0.850757,0.260024,0.465739,0.678235,...,0.531128,0.364114,1.135422,0.112131,0.040652,6.959232,0.675484,0.198752,0.724027,0.399903
4,0.443059,1.767194,1.864106,0.997791,0.572493,2.994671,1.046537,3.473873,1.164435,0.041928,...,0.479829,1.308075,3.34397,5.646572,0.818348,1.267661,0.796877,0.237935,0.379691,2.857185


In [53]:
features.to_csv('dataset/image_features/train_image_features.csv', index=False)

In [51]:
features.shape

(1518, 128)

In [56]:
test_image_features = extract_features(test_df, vgg19_model)
test_image_features.head()

  0%|          | 0/64 [00:00<?, ?it/s]



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
1518,0.187742,0.919194,0.536657,0.872991,0.699801,2.547535,1.368634,8.072812,1.832673,0.044823,...,1.125442,1.634206,2.846357,5.086789,1.339693,4.428577,1.174664,3.413805,1.214123,3.468926
1519,1.081535,0.343502,2.172968,3.097183,0.528741,3.39051,1.852074,6.226005,0.748891,0.453571,...,3.770203,2.613887,1.067632,4.110927,1.913855,4.505924,0.618563,2.125443,3.565013,9.079924
1520,0.594911,0.568306,2.661523,0.81891,1.096238,1.279988,3.859164,2.37581,0.347734,0.16199,...,2.462411,0.690099,2.405854,2.778926,1.044922,1.572516,0.743471,1.847108,1.778436,4.328998
1521,0.008054,2.782911,0.393933,1.575088,2.862458,0.536409,3.931819,6.853976,1.273635,0.000917,...,0.70334,0.34136,4.508155,3.449048,0.313936,1.829808,0.160399,0.83664,1.102968,2.660489
1522,0.032812,0.500018,1.095476,1.8897,1.660288,0.763369,1.849987,1.180393,0.106265,0.0,...,0.766487,0.190774,4.851637,2.176356,0.138018,0.460158,0.102598,1.45981,0.488602,4.208575


In [None]:
test_image_features.to_csv('dataset/image_features/test_image_features.csv', index=False)

In [16]:
model = VGG19(weights='imagenet', include_top=False)
train_features = extract_features(train_df, model)

train_features



[array([[[[ 0.        ,  0.        ,  0.        , ...,  0.        ,
           17.842972  ,  0.        ],
          [ 0.        ,  0.        ,  0.        , ...,  0.        ,
           13.583393  ,  0.        ],
          [ 0.        ,  0.        ,  0.        , ...,  0.        ,
           13.533809  ,  0.        ],
          ...,
          [ 0.        ,  0.        ,  0.        , ...,  0.        ,
            0.        ,  0.        ],
          [ 0.        ,  0.        ,  0.        , ...,  0.        ,
            0.        ,  0.        ],
          [ 0.        ,  0.        ,  0.        , ...,  0.        ,
            4.5380044 ,  0.        ]],
 
         [[ 0.        ,  0.        ,  0.        , ...,  0.        ,
            9.064399  ,  0.        ],
          [ 0.        ,  0.        ,  0.        , ...,  0.        ,
           12.391933  ,  0.        ],
          [ 0.        ,  0.        ,  0.        , ...,  0.        ,
           18.707062  ,  0.        ],
          ...,
          [ 0

In [17]:
# store features in a npy file
np.save('dataset/image_features/train_img_features.npy', train_features)

In [33]:
# load features from npy file
train_features = np.load('dataset/image_features/train_img_features.npy')

In [27]:
# convert features to a dataframe
train_features = pd.DataFrame(train_features.reshape(train_features.shape[0], -1))
train_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.737864,0.0,0.0,...,23.985191,0.0,0.0,10.401962,0.0,0.0,20.480049,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,7.2044,0.0,0.0,0.0,0.0,...,6.726958,0.0,0.0,0.0,0.0,0.0,5.147472,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.906753,0.0,0.0,...,0.0,0.0,6.184856,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### DenseNet

In [5]:
inp = layers.Input((256,256,3))
backbone = DenseNet121(input_tensor = inp, include_top = False)
x = backbone.output
x = layers.GlobalAveragePooling2D()(x)
x = layers.Lambda(lambda x: K.expand_dims(x,axis = -1))(x)
x = layers.AveragePooling1D(4)(x)
out = layers.Lambda(lambda x: x[:,:,0])(x)

densenet_model = Model(inp,out)

In [6]:
densenet_train_features = extract_features(train_df, densenet_model, densenet_preprocess_input)

  0%|          | 0/95 [00:00<?, ?it/s]



In [7]:
densenet_train_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
0,0.002412,0.255406,0.073089,0.020818,0.021223,0.001798,0.008892,0.010197,0.070118,0.018849,0.054602,0.003118,0.007767,0.003931,0.005033,0.030503,0.003325,0.013837,0.002505,0.001028,0.006882,0.028269,0.376334,0.002161,0.302289,0.132432,0.131233,0.004029,0.00678,0.295648,0.031218,0.231937,0.073761,0.149151,0.685599,0.166865,0.094126,0.026044,0.48515,0.020452,...,1.077296,0.541376,0.310558,0.357693,0.960152,0.420221,0.82025,0.698216,0.328508,0.481655,0.681017,0.64882,0.446172,0.705287,0.393157,0.903478,0.458689,1.159489,0.773173,0.642449,0.654009,0.945144,0.611428,0.546996,1.166167,0.343443,0.68204,0.287275,0.924408,0.324298,0.599071,0.667941,0.591719,0.772846,1.053931,0.563559,0.788449,0.933903,0.325151,0.782075
1,0.003262,0.209445,0.076465,0.020601,0.013247,0.001746,0.016001,0.010862,0.054241,0.016908,0.127318,0.002016,0.008529,0.003211,0.00934,0.016651,0.002608,0.021475,0.006082,0.001568,0.012869,0.039862,0.18961,0.001769,0.288122,0.164018,0.162125,0.005139,0.004739,0.072695,0.025428,0.378732,0.052171,0.069489,0.592651,0.028577,0.076047,0.034446,0.202534,0.010123,...,1.472552,2.045764,2.049966,0.959433,0.446147,0.675068,1.578061,0.826457,0.919705,0.928836,1.794812,1.363443,1.316205,0.938822,0.613611,1.091481,0.523514,1.949266,1.001246,0.353425,0.825997,0.982316,1.054037,1.092642,2.452472,0.371411,1.621604,0.863987,0.975113,0.409731,0.857704,0.663097,0.489337,0.604398,1.065173,1.656941,0.733056,0.339387,1.417239,2.256201
2,0.001464,0.083154,0.018514,0.024711,0.028696,0.002615,0.012801,0.007457,0.061622,0.019426,0.531163,0.002015,0.022792,0.002975,0.012297,0.030024,0.00112,0.037323,0.003109,0.000999,0.067159,0.073493,0.191059,0.002937,0.308102,0.451751,0.13054,0.004431,0.007547,0.38624,0.02175,0.11415,0.064977,0.165569,0.311064,0.12685,0.020116,0.0497,0.620203,0.019988,...,0.808848,0.347138,0.749638,1.171897,0.42754,0.476863,0.754987,0.401689,0.420455,0.114153,0.468656,0.98644,0.269741,0.376187,0.591756,0.222332,0.340009,0.654976,0.153556,0.54302,0.429581,0.761339,0.584203,1.423015,1.31532,0.691079,0.2537,0.345497,0.592161,0.167207,0.253258,0.468199,2.21271,0.12556,0.116755,0.070009,0.830658,0.214892,0.245031,0.456718
3,0.001842,0.070153,0.027252,0.012801,0.121898,0.001826,0.005749,0.007936,0.151572,0.014932,1.261574,0.001957,0.00631,0.002818,0.005722,0.030206,0.000464,0.002036,0.006015,0.001003,0.080958,0.055146,0.096471,0.00199,0.187203,0.204195,0.143274,0.004215,0.011337,0.194522,0.022942,0.165259,0.045997,0.212786,0.375833,0.040881,0.018696,0.055869,0.944333,0.022008,...,0.237322,0.36456,0.278841,0.334744,0.138679,0.426946,0.464995,0.497435,0.162397,0.053571,0.14759,0.382102,0.199669,0.35448,0.113205,1.165658,0.352228,1.694784,0.75675,1.365574,0.151227,1.020301,0.460455,1.787605,0.38254,2.063496,0.112593,0.116079,0.584708,0.10722,0.292904,0.090261,0.205571,0.16741,0.276909,0.267048,0.239819,0.32836,0.129048,0.646415
4,0.003214,0.142908,0.028323,0.02238,0.038958,0.001857,0.010106,0.006874,0.074566,0.024453,0.093079,0.002279,0.008727,0.004549,0.010948,0.02207,0.003543,0.034039,0.003839,0.000839,0.014784,0.015331,0.305155,0.002505,0.433958,0.210699,0.096044,0.004717,0.007021,0.297335,0.032578,0.236778,0.023112,0.146323,0.700624,0.077255,0.049108,0.020476,0.329092,0.009557,...,1.620798,1.935214,1.150098,1.21595,0.649319,1.327638,2.421384,0.937088,0.741445,0.438148,0.876544,1.438296,0.758102,0.439231,1.58146,0.437564,0.266749,1.143415,2.450484,0.824531,0.853288,0.500224,0.693787,0.608633,1.637012,0.625875,0.62841,0.83915,0.480826,0.789766,1.00348,0.561325,0.628901,0.472665,1.572514,1.373152,0.611522,0.283688,0.72131,1.036716


In [8]:
densenet_train_features.shape

(1518, 256)

In [9]:
densenet_train_features.to_csv('dataset/image_features/densenet_train_image_features.csv', index=False)

In [10]:
densenet_test_features = extract_features(test_df, densenet_model, densenet_preprocess_input)
densenet_test_features.head()

  0%|          | 0/64 [00:00<?, ?it/s]



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
1518,0.004273,0.114928,0.039943,0.026622,0.012216,0.001952,0.017378,0.004255,0.101986,0.017381,0.33754,0.001731,0.006737,0.003461,0.011286,0.034461,0.003837,0.013144,0.005954,0.000803,0.028136,0.037675,0.527466,0.00194,0.242935,0.103736,0.044655,0.007276,0.006015,0.179862,0.042919,0.181077,0.076031,0.121651,0.783468,0.021375,0.020314,0.026419,0.455073,0.006771,...,0.605151,0.769404,0.800861,0.463301,0.965122,1.154929,1.218223,0.4603,0.547016,0.649968,0.836173,1.224956,1.285517,0.889094,1.324426,0.987554,0.379569,1.263609,1.470952,1.169997,1.042157,1.966746,0.623826,0.791781,1.760651,1.511271,0.753452,1.03318,0.491862,0.346975,1.24132,0.372313,0.54792,0.562308,2.485681,0.73739,0.359036,0.343682,0.835717,1.247456
1519,0.003102,0.130681,0.054041,0.024013,0.075155,0.00203,0.008358,0.005364,0.154639,0.009527,0.200628,0.003191,0.008956,0.005391,0.02484,0.0212,0.003243,0.028073,0.003376,0.001435,0.024785,0.040131,0.199838,0.00185,0.300007,0.047122,0.05341,0.002945,0.005937,0.121891,0.027567,0.599141,0.176135,0.085939,0.612939,0.034296,0.047559,0.017005,0.426931,0.007845,...,0.707754,0.50215,0.983475,1.436921,0.719037,1.482063,1.188385,0.824488,1.345031,0.928074,0.743711,0.722747,0.622355,1.003204,1.496412,0.945001,1.060874,1.337694,1.132964,0.74774,1.443558,2.475015,0.668797,0.907776,1.659535,0.984588,1.37138,1.188568,0.472617,1.814564,1.27881,0.538953,1.159766,0.47268,2.140003,0.699961,1.171531,0.914106,1.116877,1.50939
1520,0.002691,0.195683,0.083915,0.01874,0.061179,0.001514,0.009508,0.008178,0.075114,0.024499,0.082378,0.002067,0.005312,0.003959,0.017383,0.032293,0.002565,0.026689,0.008217,0.00144,0.015122,0.051452,0.282186,0.002905,0.177553,0.27634,0.27212,0.003688,0.005247,0.112266,0.043614,0.107789,0.118491,0.051162,0.354707,0.082714,0.08486,0.016167,0.683804,0.022756,...,1.139097,0.81213,1.09289,2.005987,0.812404,0.463237,0.844871,0.831783,0.997267,2.10459,0.888073,1.303354,0.646961,0.800184,0.715561,0.568551,0.657872,0.595233,1.052606,0.494545,0.563054,2.048487,2.92752,1.518807,1.398455,0.779025,0.760135,0.619648,1.070379,1.55115,0.778826,0.696251,0.897908,1.369396,1.12957,1.495082,1.240101,1.613291,0.852301,0.932952
1521,0.003613,0.062647,0.027166,0.031916,0.018145,0.002191,0.008871,0.003298,0.124661,0.023334,0.37775,0.001382,0.00796,0.003022,0.00988,0.027242,0.001574,0.012472,0.00567,0.001059,0.054395,0.049985,0.580545,0.002169,0.389356,0.065399,0.128662,0.005684,0.008788,0.238445,0.026341,0.193843,0.032819,0.11317,0.461684,0.011037,0.035654,0.040353,0.596244,0.006649,...,1.052722,1.352414,1.784063,0.81135,0.570373,1.067954,1.03879,0.700897,0.404665,0.442027,0.629144,0.766682,0.990604,0.628284,1.090413,0.561704,0.707371,1.98013,1.040145,1.510549,0.303032,1.049015,0.40755,1.422103,1.668487,1.182279,0.277,0.952806,0.566613,0.215388,0.849845,1.592561,0.615338,0.316275,1.556629,0.680496,1.338426,0.84266,1.123443,0.740326
1522,0.003338,0.099204,0.039586,0.013847,0.046931,0.002498,0.005564,0.002962,0.168099,0.020112,0.454958,0.001895,0.00641,0.00285,0.012906,0.031406,0.00132,0.005388,0.005088,0.001325,0.04042,0.074245,0.465909,0.003364,0.181719,0.247067,0.122773,0.006148,0.008716,0.186948,0.027464,0.216064,0.079157,0.140283,0.232284,0.004855,0.033787,0.054118,1.058192,0.010131,...,0.65694,2.179405,1.335888,0.621562,0.276284,1.079919,1.729514,0.806938,0.894218,0.68409,0.992058,0.70593,1.137508,0.784717,0.453547,0.816759,1.149475,1.984105,1.72445,0.771829,0.241674,1.07158,0.513718,1.927459,1.845822,2.543176,0.746546,1.490347,0.816343,0.149978,1.325975,1.481824,0.59761,0.557286,1.942663,1.176206,1.443429,0.441209,0.549615,0.989312


In [11]:
densenet_test_features.to_csv('dataset/image_features/densenet_test_image_features.csv', index=False)

## Textual Feature

In [4]:
word_tokens = [nltk.tokenize.word_tokenize(i) for i in tqdm(train_df['text'])]
word_tokens[0]

100%|██████████| 1518/1518 [00:00<00:00, 4152.15it/s]


['akhir', 'juga', 'terobos', 'banjir']

In [5]:
ft_model = load_facebook_model('model/cc.id.300.bin')

In [14]:
ft_model.wv["banjir"]

array([-7.83165824e-03, -4.73202355e-02,  8.81230012e-02,  8.53706971e-02,
        1.34003013e-01, -2.88168043e-02, -1.50032043e-02, -1.28381401e-02,
        5.83951641e-03, -7.91077912e-02,  7.72436708e-02, -5.45463264e-02,
       -6.02833517e-02,  7.32105458e-03, -6.14437237e-02, -7.42493570e-02,
        2.84109712e-02, -6.95406869e-02, -2.40010023e-02,  9.62492451e-02,
        7.90210217e-02, -2.65017096e-02,  3.28898728e-02,  9.60502476e-02,
       -6.41337782e-02, -1.67172346e-02,  2.82737613e-02,  1.23081505e-02,
       -2.35787332e-02,  6.74109980e-02,  9.71845388e-02, -1.78277344e-02,
       -2.19146591e-02,  4.87571731e-02,  5.07424362e-02, -9.26893950e-03,
        4.04309183e-02, -2.18150970e-02,  1.14140436e-02,  4.27799486e-03,
        2.23270766e-02, -5.68529665e-02, -5.59097528e-02,  1.95064135e-02,
        5.14670499e-02, -4.36659344e-02, -5.62395975e-02, -7.81683326e-02,
        5.62892668e-02, -8.08358751e-03,  1.12346984e-01, -3.24015543e-02,
       -1.49580063e-02, -

In [12]:
ft_model.wv.most_similar('banjir', topn=10)

[('banjir-banjir', 0.8044018149375916),
 ('banjir.', 0.758131742477417),
 ('Banjir', 0.747555673122406),
 ('banjir3', 0.7226011157035828),
 ('banjirnya', 0.7184047102928162),
 ('bajir', 0.710436999797821),
 ('Banjir-banjir', 0.6992826461791992),
 ('pascabanjir', 0.6976428627967834),
 ('banjirpun', 0.6711414456367493),
 ('bandang', 0.6662591695785522)]

In [17]:
embedded_words = [[ft_model.wv[word] for word in token] for token in tqdm(word_tokens)]
embedded_words_encoding = [np.mean(embedded_word, axis=0) for embedded_word in tqdm(embedded_words)]
df_embedding = pd.DataFrame(embedded_words_encoding)
df_embedding.head()

100%|██████████| 1518/1518 [00:00<00:00, 13412.56it/s]
100%|██████████| 1518/1518 [00:00<00:00, 41538.87it/s]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.009883,-0.021286,0.025969,0.092511,0.028668,-0.02877,0.016315,-0.01051,0.008982,-0.0908,...,0.007333,0.013171,0.015966,-0.002016,0.032844,-0.006217,0.022417,-0.057426,0.017508,0.03385
1,0.034512,0.009235,0.018049,0.045457,0.006226,-0.075077,0.004485,-0.026857,-0.048104,-0.050542,...,-0.040443,-0.033196,-0.006029,0.014043,-0.015839,0.013577,0.005713,0.001407,-0.003548,0.020258
2,0.037173,-0.05029,0.011853,0.118136,-0.018496,-0.046303,0.029109,0.011791,-0.015547,-0.080154,...,-0.032494,-0.030817,-0.034458,0.015121,-0.031749,0.027591,-0.008889,-0.035597,-0.008871,0.061395
3,0.000976,-0.008979,0.025069,0.051203,0.021625,-0.046954,0.021365,0.008321,-0.029315,-0.015738,...,-0.010753,0.018024,-0.011522,-0.014646,0.006166,0.028416,0.047759,0.010205,-0.007385,0.051342
4,0.02913,0.04531,0.027381,0.040282,-0.047682,0.001377,0.01747,-0.008542,0.011416,-0.097448,...,-0.014846,-0.011152,-0.055743,0.01705,-0.014798,0.048946,0.032517,-0.016963,-0.012313,0.021128


In [19]:
def embedding(df, model):
    word_tokens = [nltk.tokenize.word_tokenize(i) for i in tqdm(df['text'])]
    embedded_words = [[model.wv[word] for word in token] for token in tqdm(word_tokens)]
    embedded_words_encoding = [np.mean(embedded_word, axis=0) for embedded_word in tqdm(embedded_words)]
    df_embedding = pd.DataFrame(embedded_words_encoding)
    return df_embedding

In [20]:
test_df_embedding = embedding(test_df, ft_model)
unlabeled_df_embedding = embedding(unlabeled_df, ft_model)

100%|██████████| 1011/1011 [00:00<00:00, 9491.52it/s]
100%|██████████| 1011/1011 [00:00<00:00, 6701.34it/s]
100%|██████████| 1011/1011 [00:00<00:00, 42104.63it/s]
100%|██████████| 9936/9936 [00:00<00:00, 10306.02it/s]
100%|██████████| 9936/9936 [00:01<00:00, 8556.23it/s]
100%|██████████| 9936/9936 [00:00<00:00, 45980.75it/s]


In [22]:
# export to csv
df_embedding.to_csv('dataset/text_features/train_text_features.csv', index=False)
test_df_embedding.to_csv('dataset/text_features/test_text_features.csv', index=False)
unlabeled_df_embedding.to_csv('dataset/text_features/unlabeled_text_features.csv', index=False)

In [36]:
df_embedding = pd.read_csv('dataset/text_features/train_text_features.csv')
test_df_embedding = pd.read_csv('dataset/text_features/test_text_features.csv')
unlabeled_df_embedding = pd.read_csv('dataset/text_features/unlabeled_text_features.csv')

df_embedding.shape, test_df_embedding.shape, unlabeled_df_embedding.shape

((1518, 300), (1011, 300), (9936, 300))