In [9]:
import pandas as pd
from scripts.extract_supported_versions import extract_supported_versions
from scripts.extract_frame_size import extract_frame_size

columns_to_keep = [
    "user_agent",
    "js_fingerprint.profile.vendor",
    "tls.extensions",
    "tls.tls_version_record",
    "tls.tls_version_negotiated",
    "tls.ja3",
    "tls.ja3_hash",
    "tls.peetprint_hash",
    "http2.akamai_fingerprint",
    "http2.sent_frames",
    "tcpip.ip.ttl",
    "tcpip.tcp.mss",
    "tcpip.tcp.window",
    "os_prediction.highest",
    "os_prediction.platform_mismatch"
]
# Очищаем датафрейм от лишних
df = pd.read_json('../raw_data/fingerprints_with_os_checked.json', lines=True)
raw_df = df.to_dict(orient='records')
df_exp = pd.json_normalize(raw_df, sep='_')

# Группировка по IP и проверка уникальных user_agents для каждого IP
conflicting_ips = df_exp.groupby('ip')['user_agent'].nunique()
conflicting_ips = conflicting_ips[conflicting_ips > 1]

# Для каждого IP, где есть несколько user_agent, оставим только первую запись
df_cleaned = df_exp[~df_exp['ip'].isin(conflicting_ips.index)]  # Убираем строки с конфликтующими IP
df_cleaned = pd.concat([df_cleaned, df_exp[df_exp['ip'].isin(conflicting_ips.index)].drop_duplicates(subset=['ip'])])

columns_to_keep_normalized = [col.replace(".", "_") for col in columns_to_keep]

data_final = df_cleaned[columns_to_keep_normalized]
data_final.loc[:, "tls_supported_versions"] = data_final['tls_extensions'].apply(extract_supported_versions)
data_final.drop(columns=['tls_extensions'], inplace=True)

display(data_final.head(10))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_final.loc[:, "tls_supported_versions"] = data_final['tls_extensions'].apply(extract_supported_versions)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_final.drop(columns=['tls_extensions'], inplace=True)


Unnamed: 0,user_agent,js_fingerprint_profile_vendor,tls_tls_version_record,tls_tls_version_negotiated,tls_ja3,tls_ja3_hash,tls_peetprint_hash,http2_akamai_fingerprint,http2_sent_frames,tcpip_ip_ttl,tcpip_tcp_mss,tcpip_tcp_window,os_prediction_highest,os_prediction_platform_mismatch,tls_supported_versions
0,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109...,"[0, ]",771,772,"771,4865-4867-4866-49195-49199-52393-52392-491...",c279b0189edb9269da7bc43dea5e0c36,fea33d3783ae74d495240377b44baa68,"1:65536,4:131072,5:16384|12517377|3:0:0:201,5:...","[{'frame_type': 'SETTINGS', 'length': 18, 'set...",45.0,1460.0,64240.0,Linux,True,"[TLS 1.3, TLS 1.2]"
1,Mozilla/5.0 (Linux; Android 8.0.0; SM-G955U Bu...,"[0, Google Inc.]",771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",c9c72924ce9126d09d6688ee2c2b462b,8ad9325e12f531d2983b78860de7b0ec,"1:65536,2:0,4:6291456,6:262144|15663105|0|m,a,s,p","[{'frame_type': 'SETTINGS', 'length': 24, 'set...",46.0,1366.0,59220.0,Linux,True,"[TLS_GREASE (0x8a8a), TLS 1.3, TLS 1.2]"
2,Mozilla/5.0 (compatible; AhrefsBot/7.0; +http:...,"[0, Google Inc.]",771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",9f8f0d3c82a11de7092c04dabcc909ac,d3adf83b0154a3dce2a78ec296fcbfa4,"1:65536,2:0,4:6291456,6:262144|15663105|0|m,a,s,p","[{'frame_type': 'SETTINGS', 'length': 24, 'set...",58.0,1460.0,65535.0,Android,True,"[TLS_GREASE (0x5a5a), TLS 1.3, TLS 1.2]"
3,Mozilla/5.0 (iPhone; CPU iPhone OS 17_2 like M...,"[0, Apple Computer, Inc.]",771,772,"771,4865-4866-4867-49196-49195-52393-49200-491...",773906b0efdefa24a7f2b8eb6985bf37,b2bafdc69377086c3416be278fd21121,"2:0,4:2097152,3:100|10485760|0|m,s,p,a","[{'frame_type': 'SETTINGS', 'length': 18, 'set...",48.0,1412.0,65535.0,iOS,True,"[TLS_GREASE (0x3a3a), TLS 1.3, TLS 1.2, TLS 1...."
4,Mozilla/5.0 (compatible; AhrefsBot/7.0; +http:...,"[0, Google Inc.]",771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",043bb5082ab9efb57fa102f13a86a6e0,d3adf83b0154a3dce2a78ec296fcbfa4,"1:65536,2:0,4:6291456,6:262144|15663105|0|m,a,s,p","[{'frame_type': 'SETTINGS', 'length': 24, 'set...",58.0,1460.0,65535.0,Android,True,"[TLS_GREASE (0x5a5a), TLS 1.3, TLS 1.2]"
5,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,"[0, Google Inc.]",771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",0d69ff451640d67ee8b5122752834766,052336488e42ad64eb9768f5bc03004a,"1:65536,3:1000,4:6291456,6:262144|15663105|0|m...","[{'frame_type': 'SETTINGS', 'length': 24, 'set...",55.0,1420.0,65320.0,Linux,True,"[TLS_GREASE (0x5a5a), TLS 1.3, TLS 1.2, TLS 1...."
6,Mozilla/5.0 (iPhone; CPU iPhone OS 17_2_1 like...,"[0, Apple Computer, Inc.]",771,772,"771,4865-4866-4867-49196-49195-52393-49200-491...",773906b0efdefa24a7f2b8eb6985bf37,b2bafdc69377086c3416be278fd21121,"2:0,4:2097152,3:100|10485760|0|m,s,p,a","[{'frame_type': 'SETTINGS', 'length': 18, 'set...",53.0,1460.0,65535.0,iOS,True,"[TLS_GREASE (0x5a5a), TLS 1.3, TLS 1.2, TLS 1...."
7,Mozilla/5.0 (iPhone; CPU iPhone OS 17_2_1 like...,"[0, Apple Computer, Inc.]",771,772,"771,4865-4866-4867-49196-49195-52393-49200-491...",773906b0efdefa24a7f2b8eb6985bf37,b2bafdc69377086c3416be278fd21121,"2:0,4:2097152,3:100|10485760|0|m,s,p,a","[{'frame_type': 'SETTINGS', 'length': 18, 'set...",53.0,1460.0,65535.0,iOS,True,"[TLS_GREASE (0x8a8a), TLS 1.3, TLS 1.2, TLS 1...."
8,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,"[0, Google Inc.]",771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",cd08e31494f9531f560d64c695473da9,22a4f858cc83b9144c829ca411948a88,"1:65536,2:0,3:1000,4:6291456,6:262144|15663105...","[{'frame_type': 'SETTINGS', 'length': 30, 'set...",45.0,1460.0,64240.0,Linux,True,"[TLS_GREASE (0x1a1a), TLS 1.3, TLS 1.2]"
9,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,"[0, Google Inc.]",771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",cd08e31494f9531f560d64c695473da9,22a4f858cc83b9144c829ca411948a88,"1:65536,2:0,3:1000,4:6291456,6:262144|15663105...","[{'frame_type': 'SETTINGS', 'length': 30, 'set...",45.0,1460.0,64240.0,Linux,True,"[TLS_GREASE (0x5a5a), TLS 1.3, TLS 1.2]"


In [10]:
df = data_final
print(df.shape)
print(df.isna().sum())

(6598, 15)
user_agent                            0
js_fingerprint_profile_vendor      2968
tls_tls_version_record                0
tls_tls_version_negotiated            0
tls_ja3                               0
tls_ja3_hash                          0
tls_peetprint_hash                    0
http2_akamai_fingerprint              0
http2_sent_frames                     0
tcpip_ip_ttl                          6
tcpip_tcp_mss                         6
tcpip_tcp_window                      6
os_prediction_highest                 0
os_prediction_platform_mismatch    2970
tls_supported_versions                5
dtype: int64


In [12]:
display(df[df['os_prediction_platform_mismatch'].isna()])

Unnamed: 0,user_agent,js_fingerprint_profile_vendor,tls_tls_version_record,tls_tls_version_negotiated,tls_ja3,tls_ja3_hash,tls_peetprint_hash,http2_akamai_fingerprint,http2_sent_frames,tcpip_ip_ttl,tcpip_tcp_mss,tcpip_tcp_window,os_prediction_highest,os_prediction_platform_mismatch,tls_supported_versions
732,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,,771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",ab6d7c76b94901e222ccf068c5347452,8ad9325e12f531d2983b78860de7b0ec,"1:65536,2:0,3:1000,4:6291456,6:262144,:2765720...","[{'frame_type': 'SETTINGS', 'length': 36, 'set...",53.0,1332.0,64240.0,Linux,,"[TLS_GREASE (0x6a6a), TLS 1.3, TLS 1.2]"
733,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,,771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",c8e74cfad72f4e7192e85b48aef8c387,8ad9325e12f531d2983b78860de7b0ec,"1:65536,2:0,3:1000,4:6291456,6:262144,:2554462...","[{'frame_type': 'SETTINGS', 'length': 36, 'set...",53.0,1332.0,64240.0,Linux,,"[TLS_GREASE (0x0a0a), TLS 1.3, TLS 1.2]"
734,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,,771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",4eb607ec303ba63ee8194cd34ae328b7,b8ce945a4d9a7a9b5b6132e3658fe033,"1:65536,2:0,4:6291456,6:262144|15663105|0|m,a,s,p","[{'frame_type': 'SETTINGS', 'length': 24, 'set...",111.0,1460.0,64240.0,Windows,,"[TLS_GREASE (0xcaca), TLS 1.3, TLS 1.2]"
735,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,,771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",b5634fa2d391896e24c2a70bc7e6d7eb,b8ce945a4d9a7a9b5b6132e3658fe033,"1:65536,2:0,4:6291456,6:262144|15663105|0|m,a,s,p","[{'frame_type': 'SETTINGS', 'length': 24, 'set...",114.0,1412.0,64240.0,Windows,,"[TLS_GREASE (0x0a0a), TLS 1.3, TLS 1.2]"
740,Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebK...,,771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",28d4a0fa4211f3c6286fe0f0655f6ffd,8ad9325e12f531d2983b78860de7b0ec,"1:65536,2:0,4:6291456,6:262144|15663105|0|m,a,s,p","[{'frame_type': 'SETTINGS', 'length': 24, 'set...",48.0,1460.0,42340.0,Android,,"[TLS_GREASE (0x2a2a), TLS 1.3, TLS 1.2]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6939,Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like M...,,771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",a11052b5ac6270650abdc43796c00540,b8ce945a4d9a7a9b5b6132e3658fe033,"1:65536,2:0,4:6291456,6:262144|15663105|0|m,a,s,p","[{'frame_type': 'SETTINGS', 'length': 24, 'set...",46.0,8910.0,35640.0,Android,,"[TLS_GREASE (0x6a6a), TLS 1.3, TLS 1.2]"
6968,Mozilla/5.0 (Linux; Android 10; K) AppleWebKit...,,771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",f7ccd19d2ef2820c08e01e139ab6a328,b8ce945a4d9a7a9b5b6132e3658fe033,"1:65536,2:0,4:6291456,6:262144|15663105|0|m,a,s,p","[{'frame_type': 'SETTINGS', 'length': 24, 'set...",57.0,1460.0,64240.0,Linux,,"[TLS_GREASE (0x0a0a), TLS 1.3, TLS 1.2]"
7223,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,,771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",b324d948349b458ac2473c4667955e33,22a4f858cc83b9144c829ca411948a88,"1:65536,2:0,3:1000,4:6291456,6:262144|15663105...","[{'frame_type': 'SETTINGS', 'length': 30, 'set...",50.0,1460.0,64240.0,Android,,"[TLS_GREASE (0xaaaa), TLS 1.3, TLS 1.2]"
7240,Mozilla/5.0 (Linux; Android 10; K) AppleWebKit...,,771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",d537c837a85e43d9cdffac603ee2057a,b8ce945a4d9a7a9b5b6132e3658fe033,"1:65536,2:0,4:6291456,6:262144|15663105|0|m,a,s,p","[{'frame_type': 'SETTINGS', 'length': 24, 'set...",108.0,1440.0,64240.0,Windows,,"[TLS_GREASE (0xfafa), TLS 1.3, TLS 1.2]"


In [175]:
print(df['js_fingerprint_profile_vendor'].value_counts())

js_fingerprint_profile_vendor
[0, Google Inc.]             3026
[0, Apple Computer, Inc.]     306
[0, ]                         298
Name: count, dtype: int64


In [176]:
# Извлечение первого элемента из каждого массива в столбце
df.loc[:, 'js_fingerprint_profile_vendor'] = df['js_fingerprint_profile_vendor'].apply(lambda x: x[1] if isinstance(x, list) else x)
display(df.head(3))

Unnamed: 0,user_agent,js_fingerprint_profile_vendor,tls_tls_version_record,tls_tls_version_negotiated,tls_ja3,tls_ja3_hash,tls_peetprint_hash,http2_akamai_fingerprint,http2_sent_frames,tcpip_ip_ttl,tcpip_tcp_mss,tcpip_tcp_window,os_prediction_highest,os_prediction_platform_mismatch,tls_supported_versions
0,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109...,,771,772,"771,4865-4867-4866-49195-49199-52393-52392-491...",c279b0189edb9269da7bc43dea5e0c36,fea33d3783ae74d495240377b44baa68,"1:65536,4:131072,5:16384|12517377|3:0:0:201,5:...","[{'frame_type': 'SETTINGS', 'length': 18, 'set...",45.0,1460.0,64240.0,Linux,True,"[TLS 1.3, TLS 1.2]"
1,Mozilla/5.0 (Linux; Android 8.0.0; SM-G955U Bu...,Google Inc.,771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",c9c72924ce9126d09d6688ee2c2b462b,8ad9325e12f531d2983b78860de7b0ec,"1:65536,2:0,4:6291456,6:262144|15663105|0|m,a,s,p","[{'frame_type': 'SETTINGS', 'length': 24, 'set...",46.0,1366.0,59220.0,Linux,True,"[TLS_GREASE (0x8a8a), TLS 1.3, TLS 1.2]"
2,Mozilla/5.0 (compatible; AhrefsBot/7.0; +http:...,Google Inc.,771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",9f8f0d3c82a11de7092c04dabcc909ac,d3adf83b0154a3dce2a78ec296fcbfa4,"1:65536,2:0,4:6291456,6:262144|15663105|0|m,a,s,p","[{'frame_type': 'SETTINGS', 'length': 24, 'set...",58.0,1460.0,65535.0,Android,True,"[TLS_GREASE (0x5a5a), TLS 1.3, TLS 1.2]"


In [177]:
print(df.isna().sum())
df.loc[:, 'http2_sent_frames'] = df['http2_sent_frames'].apply(extract_frame_size)

user_agent                            0
js_fingerprint_profile_vendor      2968
tls_tls_version_record                0
tls_tls_version_negotiated            0
tls_ja3                               0
tls_ja3_hash                          0
tls_peetprint_hash                    0
http2_akamai_fingerprint              0
http2_sent_frames                     0
tcpip_ip_ttl                          6
tcpip_tcp_mss                         6
tcpip_tcp_window                      6
os_prediction_highest                 0
os_prediction_platform_mismatch    2970
tls_supported_versions                5
dtype: int64


In [178]:
display(df[df['tls_supported_versions'].isna()])
display(df[df['tcpip_tcp_window'].isna()])
display(df[df['tcpip_tcp_mss'].isna()])

Unnamed: 0,user_agent,js_fingerprint_profile_vendor,tls_tls_version_record,tls_tls_version_negotiated,tls_ja3,tls_ja3_hash,tls_peetprint_hash,http2_akamai_fingerprint,http2_sent_frames,tcpip_ip_ttl,tcpip_tcp_mss,tcpip_tcp_window,os_prediction_highest,os_prediction_platform_mismatch,tls_supported_versions
190,Mozilla/4.0 (compatible; MSIE 7.0; Windows NT ...,Google Inc.,771,771,"771,49195-49199-49196-49200-52393-52392-49171-...",5e573c9c9f8ba720ef9b18e9fce2e2f7,6e75e96b93bf1cdb1f00953ea9e6899a,"1:65536,3:1000,4:6291456|15663105|0|m,a,s,p","[HEADER_TABLE_SIZE = 65536, MAX_CONCURRENT_STR...",51.0,1460.0,29200.0,Linux,True,
191,Mozilla/4.0 (compatible; MSIE 7.0; Windows NT ...,Google Inc.,771,771,"771,49195-49199-49196-49200-52393-52392-49171-...",5e573c9c9f8ba720ef9b18e9fce2e2f7,6e75e96b93bf1cdb1f00953ea9e6899a,"1:65536,3:1000,4:6291456|15663105|0|m,a,s,p","[HEADER_TABLE_SIZE = 65536, MAX_CONCURRENT_STR...",51.0,1460.0,29200.0,Linux,True,
793,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,,771,771,"771,49195-49199-49196-49200-52393-52392-158-15...",85f47a058bd9c3d189fa4ff442d39156,8aeebdbc08e7fe03c9916939c4e2bb15,"1:4096,2:0,4:65535,5:16384,:0,3:100,6:65536|00...","[HEADER_TABLE_SIZE = 4096, ENABLE_PUSH = 0, IN...",52.0,1311.0,42340.0,Android,,
3459,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,,771,771,"771,49195-49199-49196-49200-49171-49172-156-15...",a3663749e98327bbc4b46c7307112627,acc2084bf8cf6ebd622c1e3a0bad731f,"4:104857600|00|0|m,a,s,p",[INITIAL_WINDOW_SIZE = 104857600],113.0,1380.0,64240.0,Windows,,
3460,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,Google Inc.,771,771,"771,49195-49199-49196-49200-49171-49172-156-15...",3f17367e26fcfb6e38d2667af1c9abc2,a6845dc34d5400eb23bac8bab304c01a,"4:104857600|00|0|m,a,s,p",[INITIAL_WINDOW_SIZE = 104857600],113.0,1380.0,64240.0,Windows,True,


Unnamed: 0,user_agent,js_fingerprint_profile_vendor,tls_tls_version_record,tls_tls_version_negotiated,tls_ja3,tls_ja3_hash,tls_peetprint_hash,http2_akamai_fingerprint,http2_sent_frames,tcpip_ip_ttl,tcpip_tcp_mss,tcpip_tcp_window,os_prediction_highest,os_prediction_platform_mismatch,tls_supported_versions
868,curl/7.81.0,,771,772,"771,4866-4867-4865-49196-49200-159-52393-52392...",4ea056e63b7910cbf543f0c095064dfe,551ea4c38754f1085d2b16b947b868ef,"3:100,4:33554432,2:0|33488897|0|m,p,s,a","[MAX_CONCURRENT_STREAMS = 100, INITIAL_WINDOW_...",,,,iOS,,"[TLS 1.3, TLS 1.2]"
869,curl/7.81.0,,771,772,"771,4866-4867-4865-49196-49200-159-52393-52392...",4ea056e63b7910cbf543f0c095064dfe,551ea4c38754f1085d2b16b947b868ef,"3:100,4:33554432,2:0|33488897|0|m,p,s,a","[MAX_CONCURRENT_STREAMS = 100, INITIAL_WINDOW_...",,,,iOS,,"[TLS 1.3, TLS 1.2]"
871,curl/7.81.0,,771,772,"771,4866-4867-4865-49196-49200-159-52393-52392...",4ea056e63b7910cbf543f0c095064dfe,551ea4c38754f1085d2b16b947b868ef,"3:100,4:33554432,2:0|33488897|0|m,p,s,a","[MAX_CONCURRENT_STREAMS = 100, INITIAL_WINDOW_...",,,,iOS,,"[TLS 1.3, TLS 1.2]"
872,curl/7.81.0,,771,772,"771,4866-4867-4865-49196-49200-159-52393-52392...",4ea056e63b7910cbf543f0c095064dfe,551ea4c38754f1085d2b16b947b868ef,"3:100,4:33554432,2:0|33488897|0|m,p,s,a","[MAX_CONCURRENT_STREAMS = 100, INITIAL_WINDOW_...",,,,iOS,,"[TLS 1.3, TLS 1.2]"
6490,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,Google Inc.,771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",0ae18052c288c1bd39910255598ed827,67037cd3eea4cb139a4e1b47bd5d77d6,"1:65536,3:1000,4:6291456,6:262144|15663105|0|m...","[HEADER_TABLE_SIZE = 65536, MAX_CONCURRENT_STR...",,,,iOS,True,"[TLS_GREASE (0x8a8a), TLS 1.3, TLS 1.2, TLS 1...."
7363,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,Google Inc.,771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",0ae18052c288c1bd39910255598ed827,67037cd3eea4cb139a4e1b47bd5d77d6,"1:65536,3:1000,4:6291456,6:262144|15663105|0|m...","[HEADER_TABLE_SIZE = 65536, MAX_CONCURRENT_STR...",,,,iOS,True,"[TLS_GREASE (0xfafa), TLS 1.3, TLS 1.2, TLS 1...."


Unnamed: 0,user_agent,js_fingerprint_profile_vendor,tls_tls_version_record,tls_tls_version_negotiated,tls_ja3,tls_ja3_hash,tls_peetprint_hash,http2_akamai_fingerprint,http2_sent_frames,tcpip_ip_ttl,tcpip_tcp_mss,tcpip_tcp_window,os_prediction_highest,os_prediction_platform_mismatch,tls_supported_versions
868,curl/7.81.0,,771,772,"771,4866-4867-4865-49196-49200-159-52393-52392...",4ea056e63b7910cbf543f0c095064dfe,551ea4c38754f1085d2b16b947b868ef,"3:100,4:33554432,2:0|33488897|0|m,p,s,a","[MAX_CONCURRENT_STREAMS = 100, INITIAL_WINDOW_...",,,,iOS,,"[TLS 1.3, TLS 1.2]"
869,curl/7.81.0,,771,772,"771,4866-4867-4865-49196-49200-159-52393-52392...",4ea056e63b7910cbf543f0c095064dfe,551ea4c38754f1085d2b16b947b868ef,"3:100,4:33554432,2:0|33488897|0|m,p,s,a","[MAX_CONCURRENT_STREAMS = 100, INITIAL_WINDOW_...",,,,iOS,,"[TLS 1.3, TLS 1.2]"
871,curl/7.81.0,,771,772,"771,4866-4867-4865-49196-49200-159-52393-52392...",4ea056e63b7910cbf543f0c095064dfe,551ea4c38754f1085d2b16b947b868ef,"3:100,4:33554432,2:0|33488897|0|m,p,s,a","[MAX_CONCURRENT_STREAMS = 100, INITIAL_WINDOW_...",,,,iOS,,"[TLS 1.3, TLS 1.2]"
872,curl/7.81.0,,771,772,"771,4866-4867-4865-49196-49200-159-52393-52392...",4ea056e63b7910cbf543f0c095064dfe,551ea4c38754f1085d2b16b947b868ef,"3:100,4:33554432,2:0|33488897|0|m,p,s,a","[MAX_CONCURRENT_STREAMS = 100, INITIAL_WINDOW_...",,,,iOS,,"[TLS 1.3, TLS 1.2]"
6490,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,Google Inc.,771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",0ae18052c288c1bd39910255598ed827,67037cd3eea4cb139a4e1b47bd5d77d6,"1:65536,3:1000,4:6291456,6:262144|15663105|0|m...","[HEADER_TABLE_SIZE = 65536, MAX_CONCURRENT_STR...",,,,iOS,True,"[TLS_GREASE (0x8a8a), TLS 1.3, TLS 1.2, TLS 1...."
7363,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,Google Inc.,771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",0ae18052c288c1bd39910255598ed827,67037cd3eea4cb139a4e1b47bd5d77d6,"1:65536,3:1000,4:6291456,6:262144|15663105|0|m...","[HEADER_TABLE_SIZE = 65536, MAX_CONCURRENT_STR...",,,,iOS,True,"[TLS_GREASE (0xfafa), TLS 1.3, TLS 1.2, TLS 1...."


In [179]:
display(df.head(5))

Unnamed: 0,user_agent,js_fingerprint_profile_vendor,tls_tls_version_record,tls_tls_version_negotiated,tls_ja3,tls_ja3_hash,tls_peetprint_hash,http2_akamai_fingerprint,http2_sent_frames,tcpip_ip_ttl,tcpip_tcp_mss,tcpip_tcp_window,os_prediction_highest,os_prediction_platform_mismatch,tls_supported_versions
0,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109...,,771,772,"771,4865-4867-4866-49195-49199-52393-52392-491...",c279b0189edb9269da7bc43dea5e0c36,fea33d3783ae74d495240377b44baa68,"1:65536,4:131072,5:16384|12517377|3:0:0:201,5:...","[HEADER_TABLE_SIZE = 65536, INITIAL_WINDOW_SIZ...",45.0,1460.0,64240.0,Linux,True,"[TLS 1.3, TLS 1.2]"
1,Mozilla/5.0 (Linux; Android 8.0.0; SM-G955U Bu...,Google Inc.,771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",c9c72924ce9126d09d6688ee2c2b462b,8ad9325e12f531d2983b78860de7b0ec,"1:65536,2:0,4:6291456,6:262144|15663105|0|m,a,s,p","[HEADER_TABLE_SIZE = 65536, ENABLE_PUSH = 0, I...",46.0,1366.0,59220.0,Linux,True,"[TLS_GREASE (0x8a8a), TLS 1.3, TLS 1.2]"
2,Mozilla/5.0 (compatible; AhrefsBot/7.0; +http:...,Google Inc.,771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",9f8f0d3c82a11de7092c04dabcc909ac,d3adf83b0154a3dce2a78ec296fcbfa4,"1:65536,2:0,4:6291456,6:262144|15663105|0|m,a,s,p","[HEADER_TABLE_SIZE = 65536, ENABLE_PUSH = 0, I...",58.0,1460.0,65535.0,Android,True,"[TLS_GREASE (0x5a5a), TLS 1.3, TLS 1.2]"
3,Mozilla/5.0 (iPhone; CPU iPhone OS 17_2 like M...,"Apple Computer, Inc.",771,772,"771,4865-4866-4867-49196-49195-52393-49200-491...",773906b0efdefa24a7f2b8eb6985bf37,b2bafdc69377086c3416be278fd21121,"2:0,4:2097152,3:100|10485760|0|m,s,p,a","[ENABLE_PUSH = 0, INITIAL_WINDOW_SIZE = 209715...",48.0,1412.0,65535.0,iOS,True,"[TLS_GREASE (0x3a3a), TLS 1.3, TLS 1.2, TLS 1...."
4,Mozilla/5.0 (compatible; AhrefsBot/7.0; +http:...,Google Inc.,771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",043bb5082ab9efb57fa102f13a86a6e0,d3adf83b0154a3dce2a78ec296fcbfa4,"1:65536,2:0,4:6291456,6:262144|15663105|0|m,a,s,p","[HEADER_TABLE_SIZE = 65536, ENABLE_PUSH = 0, I...",58.0,1460.0,65535.0,Android,True,"[TLS_GREASE (0x5a5a), TLS 1.3, TLS 1.2]"


In [180]:
df = df.copy()
df.dropna(inplace= True)
print(df.isna().sum())
print(df.shape)

user_agent                         0
js_fingerprint_profile_vendor      0
tls_tls_version_record             0
tls_tls_version_negotiated         0
tls_ja3                            0
tls_ja3_hash                       0
tls_peetprint_hash                 0
http2_akamai_fingerprint           0
http2_sent_frames                  0
tcpip_ip_ttl                       0
tcpip_tcp_mss                      0
tcpip_tcp_window                   0
os_prediction_highest              0
os_prediction_platform_mismatch    0
tls_supported_versions             0
dtype: int64
(3623, 15)


In [181]:
columns_str = [
    "user_agent",
    "js_fingerprint_profile_vendor",
    "tls_ja3",
    "tls_ja3_hash",
    "tls_peetprint_hash",
    "http2_akamai_fingerprint",
    "os_prediction_highest"
]
columns_numeric = [
    "tls_tls_version_record",
    "tls_tls_version_negotiated"
]

df[columns_numeric] = df[columns_numeric].apply(pd.to_numeric)
print(df.dtypes)

user_agent                          object
js_fingerprint_profile_vendor       object
tls_tls_version_record               int64
tls_tls_version_negotiated           int64
tls_ja3                             object
tls_ja3_hash                        object
tls_peetprint_hash                  object
http2_akamai_fingerprint            object
http2_sent_frames                   object
tcpip_ip_ttl                       float64
tcpip_tcp_mss                      float64
tcpip_tcp_window                   float64
os_prediction_highest               object
os_prediction_platform_mismatch     object
tls_supported_versions              object
dtype: object


In [182]:
display(df[df.loc[:, "os_prediction_platform_mismatch"] == 0])

Unnamed: 0,user_agent,js_fingerprint_profile_vendor,tls_tls_version_record,tls_tls_version_negotiated,tls_ja3,tls_ja3_hash,tls_peetprint_hash,http2_akamai_fingerprint,http2_sent_frames,tcpip_ip_ttl,tcpip_tcp_mss,tcpip_tcp_window,os_prediction_highest,os_prediction_platform_mismatch,tls_supported_versions
193,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,Google Inc.,771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",0d69ff451640d67ee8b5122752834766,a0468212376b470d0ee57e6922986be3,"1:65536,3:1000,4:6291456,6:262144|15663105|0|m...","[HEADER_TABLE_SIZE = 65536, MAX_CONCURRENT_STR...",56.0,1460.0,29200.0,Windows,False,"[TLS_GREASE (0x1a1a), TLS 1.3, TLS 1.2]"


In [183]:
df.drop(index=193, inplace = True)

In [184]:
display(df.head(10))
df.to_csv('../clean_data/notebooks.csv')
df.to_json('../clean_data/notebooks.json')

Unnamed: 0,user_agent,js_fingerprint_profile_vendor,tls_tls_version_record,tls_tls_version_negotiated,tls_ja3,tls_ja3_hash,tls_peetprint_hash,http2_akamai_fingerprint,http2_sent_frames,tcpip_ip_ttl,tcpip_tcp_mss,tcpip_tcp_window,os_prediction_highest,os_prediction_platform_mismatch,tls_supported_versions
0,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109...,,771,772,"771,4865-4867-4866-49195-49199-52393-52392-491...",c279b0189edb9269da7bc43dea5e0c36,fea33d3783ae74d495240377b44baa68,"1:65536,4:131072,5:16384|12517377|3:0:0:201,5:...","[HEADER_TABLE_SIZE = 65536, INITIAL_WINDOW_SIZ...",45.0,1460.0,64240.0,Linux,True,"[TLS 1.3, TLS 1.2]"
1,Mozilla/5.0 (Linux; Android 8.0.0; SM-G955U Bu...,Google Inc.,771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",c9c72924ce9126d09d6688ee2c2b462b,8ad9325e12f531d2983b78860de7b0ec,"1:65536,2:0,4:6291456,6:262144|15663105|0|m,a,s,p","[HEADER_TABLE_SIZE = 65536, ENABLE_PUSH = 0, I...",46.0,1366.0,59220.0,Linux,True,"[TLS_GREASE (0x8a8a), TLS 1.3, TLS 1.2]"
2,Mozilla/5.0 (compatible; AhrefsBot/7.0; +http:...,Google Inc.,771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",9f8f0d3c82a11de7092c04dabcc909ac,d3adf83b0154a3dce2a78ec296fcbfa4,"1:65536,2:0,4:6291456,6:262144|15663105|0|m,a,s,p","[HEADER_TABLE_SIZE = 65536, ENABLE_PUSH = 0, I...",58.0,1460.0,65535.0,Android,True,"[TLS_GREASE (0x5a5a), TLS 1.3, TLS 1.2]"
3,Mozilla/5.0 (iPhone; CPU iPhone OS 17_2 like M...,"Apple Computer, Inc.",771,772,"771,4865-4866-4867-49196-49195-52393-49200-491...",773906b0efdefa24a7f2b8eb6985bf37,b2bafdc69377086c3416be278fd21121,"2:0,4:2097152,3:100|10485760|0|m,s,p,a","[ENABLE_PUSH = 0, INITIAL_WINDOW_SIZE = 209715...",48.0,1412.0,65535.0,iOS,True,"[TLS_GREASE (0x3a3a), TLS 1.3, TLS 1.2, TLS 1...."
4,Mozilla/5.0 (compatible; AhrefsBot/7.0; +http:...,Google Inc.,771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",043bb5082ab9efb57fa102f13a86a6e0,d3adf83b0154a3dce2a78ec296fcbfa4,"1:65536,2:0,4:6291456,6:262144|15663105|0|m,a,s,p","[HEADER_TABLE_SIZE = 65536, ENABLE_PUSH = 0, I...",58.0,1460.0,65535.0,Android,True,"[TLS_GREASE (0x5a5a), TLS 1.3, TLS 1.2]"
5,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,Google Inc.,771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",0d69ff451640d67ee8b5122752834766,052336488e42ad64eb9768f5bc03004a,"1:65536,3:1000,4:6291456,6:262144|15663105|0|m...","[HEADER_TABLE_SIZE = 65536, MAX_CONCURRENT_STR...",55.0,1420.0,65320.0,Linux,True,"[TLS_GREASE (0x5a5a), TLS 1.3, TLS 1.2, TLS 1...."
6,Mozilla/5.0 (iPhone; CPU iPhone OS 17_2_1 like...,"Apple Computer, Inc.",771,772,"771,4865-4866-4867-49196-49195-52393-49200-491...",773906b0efdefa24a7f2b8eb6985bf37,b2bafdc69377086c3416be278fd21121,"2:0,4:2097152,3:100|10485760|0|m,s,p,a","[ENABLE_PUSH = 0, INITIAL_WINDOW_SIZE = 209715...",53.0,1460.0,65535.0,iOS,True,"[TLS_GREASE (0x5a5a), TLS 1.3, TLS 1.2, TLS 1...."
7,Mozilla/5.0 (iPhone; CPU iPhone OS 17_2_1 like...,"Apple Computer, Inc.",771,772,"771,4865-4866-4867-49196-49195-52393-49200-491...",773906b0efdefa24a7f2b8eb6985bf37,b2bafdc69377086c3416be278fd21121,"2:0,4:2097152,3:100|10485760|0|m,s,p,a","[ENABLE_PUSH = 0, INITIAL_WINDOW_SIZE = 209715...",53.0,1460.0,65535.0,iOS,True,"[TLS_GREASE (0x8a8a), TLS 1.3, TLS 1.2, TLS 1...."
8,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,Google Inc.,771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",cd08e31494f9531f560d64c695473da9,22a4f858cc83b9144c829ca411948a88,"1:65536,2:0,3:1000,4:6291456,6:262144|15663105...","[HEADER_TABLE_SIZE = 65536, ENABLE_PUSH = 0, M...",45.0,1460.0,64240.0,Linux,True,"[TLS_GREASE (0x1a1a), TLS 1.3, TLS 1.2]"
9,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,Google Inc.,771,772,"771,4865-4866-4867-49195-49199-49196-49200-523...",cd08e31494f9531f560d64c695473da9,22a4f858cc83b9144c829ca411948a88,"1:65536,2:0,3:1000,4:6291456,6:262144|15663105...","[HEADER_TABLE_SIZE = 65536, ENABLE_PUSH = 0, M...",45.0,1460.0,64240.0,Linux,True,"[TLS_GREASE (0x5a5a), TLS 1.3, TLS 1.2]"
