# Nemek előrejelzése kapcsolatok alapján

In [1]:
# Itt deklaráljuk azokat a függvényeket, melyeket segítségként megkaptunk előre:

import pandas as pd
import numpy as np

COLUMNS_STR = """    user_id
    public
    completion_percentage
    gender
    region
    last_login
    registration
    AGE
    body
    I_am_working_in_field
    spoken_languages
    hobbies
    I_most_enjoy_good_food
    pets
    body_type
    my_eyesight
    eye_color
    hair_color
    hair_type
    completed_level_of_education
    favourite_color
    relation_to_smoking
    relation_to_alcohol
    sign_in_zodiac
    on_pokec_i_am_looking_for
    love_is_for_me
    relation_to_casual_sex
    my_partner_should_be
    marital_status
    children
    relation_to_children
    I_like_movies
    I_like_watching_movie
    I_like_music
    I_mostly_like_listening_to_music
    the_idea_of_good_evening
    I_like_specialties_from_kitchen
    fun
    I_am_going_to_concerts
    my_active_sports
    my_passive_sports
    profession
    I_like_books
    life_style
    music
    cars
    politics
    relationships
    art_culture
    hobbies_interests
    science_technologies
    computers_internet
    education
    sport
    movies
    travelling
    health
    companies_brands
    more"""
COLUMNS_LIST = [col.strip() for col in COLUMNS_STR.split("\n")]


np.random.seed(42)


def select_relevant_profiles(all_profiles):
    """Select relevant profiles
    criteria:
    * is public
    * region is selected region
    * AGE specified
    * GENDER SPECIFIED
    """
    public_condition = all_profiles["public"] == 1
    age_condition = all_profiles["AGE"] > 14
    gender_condition = all_profiles["gender"].isin([0, 1])
    return all_profiles.loc[public_condition & age_condition & gender_condition]


def select_relevant_edges(all_edges, selected_ids):
    """Select relevant edges for those profiles that are relevant"""
    source_condition = all_edges["source"].isin(selected_ids)
    sink_condition = all_edges["sink"].isin(selected_ids)
    return all_edges.loc[source_condition & sink_condition]


def convert_edges_to_undirected(edges):
    """Convert edges to undirected, and keep only mutual connections"""
    undirected_edges = (
        edges.assign(
            smaller_id=lambda df: df[["source", "sink"]].min(axis=1),
            greater_id=lambda df: df[["source", "sink"]].max(axis=1),
        )
        .groupby(["smaller_id", "greater_id"])
        .agg({"source": "count"})
    )
    print(undirected_edges["source"].value_counts())
    return (
        undirected_edges.loc[undirected_edges["source"] == 2]
        .drop("source", axis=1)
        .reset_index()
    )


def load_and_select_profiles_and_edges():
    """load and select relevant profiles, then filter and undirect edges"""
    print("loading profiles")
    # TODO: Add some functionality to only read a subset of the data!
    profiles = pd.read_csv(
        "soc-pokec-profiles.txt",
        sep="\t",
        names=COLUMNS_LIST,
        index_col=False,
        usecols=["user_id", "public", "gender", "AGE"],
    )
    print("loading edges")
    edges = pd.read_csv(
        "soc-pokec-relationships.txt", sep="\t", names=["source", "sink"]
    )
    selected_profiles = select_relevant_profiles(profiles)
    selected_ids = selected_profiles["user_id"].unique()
    selected_edges = select_relevant_edges(edges, selected_ids)

    undirected_edges = convert_edges_to_undirected(selected_edges)
    nodes_with_edges = set(undirected_edges["smaller_id"].unique()).union(
        undirected_edges["greater_id"].unique()
    )
    print(f"Selected profiles: {len(selected_profiles)}")
    print(f"Nodes with edges: {len(nodes_with_edges)}")
    selected_profiles = selected_profiles[
        selected_profiles["user_id"].isin(nodes_with_edges)
    ]
    selected_profiles["AGE"] = selected_profiles["AGE"].clip(upper=50)
    selected_profiles = remove_test_set_gender_and_age(selected_profiles)
    return selected_profiles, undirected_edges

def load_and_select_profiles_and_edges_resultcheck():
    """load and select relevant profiles, then filter and undirect edges"""
    print("loading profiles")
    # TODO: Add some functionality to only read a subset of the data!
    profiles = pd.read_csv(
        "soc-pokec-profiles.txt",
        sep="\t",
        names=COLUMNS_LIST,
        index_col=False,
        usecols=["user_id", "public", "gender", "region", "AGE"],
    )
    print("loading edges")
    edges = pd.read_csv(
        "soc-pokec-relationships.txt", sep="\t", names=["source", "sink"]
    )
    selected_profiles = select_relevant_profiles(profiles)
    selected_ids = selected_profiles["user_id"].unique()
    selected_edges = select_relevant_edges(edges, selected_ids)

    undirected_edges = convert_edges_to_undirected(selected_edges)
    nodes_with_edges = set(undirected_edges["smaller_id"].unique()).union(
        undirected_edges["greater_id"].unique()
    )
    print(f"Selected profiles: {len(selected_profiles)}")
    print(f"Nodes with edges: {len(nodes_with_edges)}")
    selected_profiles = selected_profiles[
        selected_profiles["user_id"].isin(nodes_with_edges)
    ]
    selected_profiles["AGE"] = selected_profiles["AGE"].clip(upper=50)
    return selected_profiles, undirected_edges


def remove_test_set_gender_and_age(nodes):
    """Remove the gender feature from a subset of the nodes for estimation"""
    # todo: the 40k  random can be adjusted if youre working with a subset
    test_profiles = np.random.choice(nodes["user_id"].unique(), 40000, replace=False)
    nodes["TRAIN_TEST"] = "TRAIN"
    test_condition = nodes["user_id"].isin(test_profiles)
    nodes.loc[test_condition, ["AGE", "gender"]] = np.nan
    nodes.loc[test_condition, ["TRAIN_TEST"]] = "TEST"

    return nodes

In [2]:
nodes, edges = load_and_select_profiles_and_edges()

loading profiles
loading edges
1    2642113
2    1992414
Name: source, dtype: int64
Selected profiles: 695406
Nodes with edges: 434590


In [3]:
nodes

Unnamed: 0,user_id,public,gender,AGE,TRAIN_TEST
0,1,1,1.0,26.0,TRAIN
4,4,1,0.0,26.0,TRAIN
5,17,1,0.0,27.0,TRAIN
6,5,1,1.0,26.0,TRAIN
7,18,1,1.0,18.0,TRAIN
...,...,...,...,...,...
1632778,1632779,1,,,TEST
1632781,1632782,1,,,TEST
1632782,1632783,1,1.0,37.0,TRAIN
1632784,1632785,1,1.0,37.0,TRAIN


In [4]:
edges

Unnamed: 0,smaller_id,greater_id
0,1,4
1,1,5
2,1,7
3,1,10
4,1,11
...,...,...
1992409,1632157,1632753
1992410,1632204,1632745
1992411,1632689,1632782
1992412,1632779,1632792


Mivel az 'edges' adat minden kapcsolatot egyszer tartalmaz így van olyan felhasználó, akinek csak a greater_id-ban szerepel a száma. Első lépésben az éleket duplikáltuk fordított sorrendben, így a "smaller_id"-ben gyűjtjük az adott embereket és a "greater_id"-ben pedig a kapcsolataikat. Ezáltal az összes felhasználó száma benne lesz mindekét oszlopban.


In [5]:
edges_reverse = edges.copy()
edges_reverse['greater_id']=edges['smaller_id']
edges_reverse['smaller_id']=edges['greater_id']
edges_reverse = edges_reverse.sort_values(by = ['smaller_id'])
edges_reverse

Unnamed: 0,smaller_id,greater_id
0,4,1
1,5,1
2,7,1
3,10,1
4,11,1
...,...,...
1992410,1632745,1632204
1992409,1632753,1632157
1992411,1632782,1632689
1992413,1632785,1632783


In [6]:
relations = pd.concat([edges,edges_reverse])
relations = relations.sort_values(by = ['smaller_id'])
relations

Unnamed: 0,smaller_id,greater_id
0,1,4
1,1,5
2,1,7
3,1,10
4,1,11
...,...,...
1992412,1632779,1632792
1992411,1632782,1632689
1992413,1632783,1632785
1992413,1632785,1632783


In [7]:
#A TRAIN jelzésű emberek kiválasztása

selected_profiles_train = nodes.copy()
selected_profiles_train = selected_profiles_train[selected_profiles_train['TRAIN_TEST']=='TRAIN']
selected_profiles_train

Unnamed: 0,user_id,public,gender,AGE,TRAIN_TEST
0,1,1,1.0,26.0,TRAIN
4,4,1,0.0,26.0,TRAIN
5,17,1,0.0,27.0,TRAIN
6,5,1,1.0,26.0,TRAIN
7,18,1,1.0,18.0,TRAIN
...,...,...,...,...,...
1632744,1632745,1,0.0,50.0,TRAIN
1632752,1632753,1,0.0,37.0,TRAIN
1632782,1632783,1,1.0,37.0,TRAIN
1632784,1632785,1,1.0,37.0,TRAIN


In [8]:
relations = relations.join(selected_profiles_train.set_index('user_id'),on='smaller_id')

In [9]:
relations = relations[relations['TRAIN_TEST']=='TRAIN']
relations

Unnamed: 0,smaller_id,greater_id,public,gender,AGE,TRAIN_TEST
0,1,4,1.0,1.0,26.0,TRAIN
1,1,5,1.0,1.0,26.0,TRAIN
2,1,7,1.0,1.0,26.0,TRAIN
3,1,10,1.0,1.0,26.0,TRAIN
4,1,11,1.0,1.0,26.0,TRAIN
...,...,...,...,...,...,...
1992410,1632745,1632204,1.0,0.0,50.0,TRAIN
1992409,1632753,1632157,1.0,0.0,37.0,TRAIN
1992413,1632783,1632785,1.0,1.0,37.0,TRAIN
1992413,1632785,1632783,1.0,1.0,37.0,TRAIN


In [10]:
relations=relations.drop(columns=['public','gender','AGE','TRAIN_TEST'])

In [11]:
# greater_id alapján nem meghatározása

relations = relations.join(nodes.set_index('user_id'),on='greater_id')
relations

Unnamed: 0,smaller_id,greater_id,public,gender,AGE,TRAIN_TEST
0,1,4,1,0.0,26.0,TRAIN
1,1,5,1,1.0,26.0,TRAIN
2,1,7,1,0.0,22.0,TRAIN
3,1,10,1,0.0,22.0,TRAIN
4,1,11,1,0.0,22.0,TRAIN
...,...,...,...,...,...,...
1992410,1632745,1632204,1,0.0,46.0,TRAIN
1992409,1632753,1632157,1,0.0,36.0,TRAIN
1992413,1632783,1632785,1,1.0,37.0,TRAIN
1992413,1632785,1632783,1,1.0,37.0,TRAIN


In [12]:
# így a 'smaller_id' oszloban vannak az egyes felhasználók, a 'greater_id'-ban a kapcsolataik és a 'gender'
# oszlopban a kapcsolataik neme, 1-es ha férfi, 0 ha nő

relations=relations.drop(columns=['public','AGE','TRAIN_TEST'])
relations

Unnamed: 0,smaller_id,greater_id,gender
0,1,4,0.0
1,1,5,1.0
2,1,7,0.0
3,1,10,0.0
4,1,11,0.0
...,...,...,...
1992410,1632745,1632204,0.0
1992409,1632753,1632157,0.0
1992413,1632783,1632785,1.0
1992413,1632785,1632783,1.0


In [13]:
# a 'gender' oszlopot átnevezzük 'friends_genderr'-é, mert szükség lesz még a 'gender' elnevezésre

relations = relations.rename(columns = {'gender' : 'friends_gender'})
relations

Unnamed: 0,smaller_id,greater_id,friends_gender
0,1,4,0.0
1,1,5,1.0
2,1,7,0.0
3,1,10,0.0
4,1,11,0.0
...,...,...,...
1992410,1632745,1632204,0.0
1992409,1632753,1632157,0.0
1992413,1632783,1632785,1.0
1992413,1632785,1632783,1.0


In [14]:
# ismét hozzá csatoljuk az eredeti adatokat, mert szükségünk van a felhasználók nemére

relations = relations.join(nodes.set_index('user_id'),on='smaller_id')
relations

Unnamed: 0,smaller_id,greater_id,friends_gender,public,gender,AGE,TRAIN_TEST
0,1,4,0.0,1,1.0,26.0,TRAIN
1,1,5,1.0,1,1.0,26.0,TRAIN
2,1,7,0.0,1,1.0,26.0,TRAIN
3,1,10,0.0,1,1.0,26.0,TRAIN
4,1,11,0.0,1,1.0,26.0,TRAIN
...,...,...,...,...,...,...,...
1992410,1632745,1632204,0.0,1,0.0,50.0,TRAIN
1992409,1632753,1632157,0.0,1,0.0,37.0,TRAIN
1992413,1632783,1632785,1.0,1,1.0,37.0,TRAIN
1992413,1632785,1632783,1.0,1,1.0,37.0,TRAIN


In [15]:
# kidobjuk a felesleges oszlopokat, így  maradnak a felhasználók (= 'smaller_id'), a kapcsolataik (= 'greater_id'),
# a kapcsolataik neme (= 'friends_gender') és a felhasználók neme (= 'gender')

relations=relations.drop(columns=['public','AGE','TRAIN_TEST'])
relations

Unnamed: 0,smaller_id,greater_id,friends_gender,gender
0,1,4,0.0,1.0
1,1,5,1.0,1.0
2,1,7,0.0,1.0
3,1,10,0.0,1.0
4,1,11,0.0,1.0
...,...,...,...,...
1992410,1632745,1632204,0.0,0.0
1992409,1632753,1632157,0.0,0.0
1992413,1632783,1632785,1.0,1.0
1992413,1632785,1632783,1.0,1.0


In [16]:
# rendezzük az oszlopokat, hogy logikailag következzenek egymás után

relations = relations [['smaller_id', 'greater_id', 'gender', 'friends_gender']]
relations

Unnamed: 0,smaller_id,greater_id,gender,friends_gender
0,1,4,1.0,0.0
1,1,5,1.0,1.0
2,1,7,1.0,0.0
3,1,10,1.0,0.0
4,1,11,1.0,0.0
...,...,...,...,...
1992410,1632745,1632204,0.0,0.0
1992409,1632753,1632157,0.0,0.0
1992413,1632783,1632785,1.0,1.0
1992413,1632785,1632783,1.0,1.0


In [17]:
# a felhasználók id-ja szerint csoportosítjuk, ezáltal a 'gender' oszlopban 0-t kapunk, ha az adott felhasználó nő,
# 0-nál nagyobb számot kapunk, ha férfi, a 'friends_gender' oszlopban pedig. az a szám fog szerepelni, amennyi férfi
#ismerőse van az illetőnek

malefriends = relations.groupby('smaller_id').sum()
malefriends

Unnamed: 0_level_0,greater_id,gender,friends_gender
smaller_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,49,6.0,2.0
4,1,0.0,1.0
5,1082,7.0,2.0
7,4070,0.0,10.0
10,6575,0.0,17.0
...,...,...,...
1632745,1632204,0.0,0.0
1632753,1632157,0.0,0.0
1632783,1632785,1.0,1.0
1632785,1632783,1.0,1.0


In [18]:
# összesen hány ismerőse van az egyes felhasználóknak

friends = pd.DataFrame(relations.groupby('smaller_id').size())
friends

Unnamed: 0_level_0,0
smaller_id,Unnamed: 1_level_1
1,6
4,1
5,7
7,20
10,28
...,...
1632745,1
1632753,1
1632783,1
1632785,1


In [19]:
friends.columns

RangeIndex(start=0, stop=1, step=1)

In [20]:
friends[0]

smaller_id
1           6
4           1
5           7
7          20
10         28
           ..
1632745     1
1632753     1
1632783     1
1632785     1
1632792     1
Name: 0, Length: 394590, dtype: int64

In [21]:
# a 'friends' oszlopot hozzáadjuk a dataframe-hez

malefriends['friends'] = friends[0]
malefriends

Unnamed: 0_level_0,greater_id,gender,friends_gender,friends
smaller_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,49,6.0,2.0,6
4,1,0.0,1.0,1
5,1082,7.0,2.0,7
7,4070,0.0,10.0,20
10,6575,0.0,17.0,28
...,...,...,...,...
1632745,1632204,0.0,0.0,1
1632753,1632157,0.0,0.0,1
1632783,1632785,1.0,1.0,1
1632785,1632783,1.0,1.0,1


In [22]:
malefriends.reset_index()

Unnamed: 0,smaller_id,greater_id,gender,friends_gender,friends
0,1,49,6.0,2.0,6
1,4,1,0.0,1.0,1
2,5,1082,7.0,2.0,7
3,7,4070,0.0,10.0,20
4,10,6575,0.0,17.0,28
...,...,...,...,...,...
394585,1632745,1632204,0.0,0.0,1
394586,1632753,1632157,0.0,0.0,1
394587,1632783,1632785,1.0,1.0,1
394588,1632785,1632783,1.0,1.0,1


In [23]:
# kiszámoljuk, hogy az egyes felhasználók esetében az ismerőseik mekkora aránya férfi, az így kapott oszlopnak a
# 'male percentage' nevet adjuk

malefriends['male percentage'] = malefriends['friends_gender']/malefriends['friends']
malefriends

Unnamed: 0_level_0,greater_id,gender,friends_gender,friends,male percentage
smaller_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,49,6.0,2.0,6,0.333333
4,1,0.0,1.0,1,1.000000
5,1082,7.0,2.0,7,0.285714
7,4070,0.0,10.0,20,0.500000
10,6575,0.0,17.0,28,0.607143
...,...,...,...,...,...
1632745,1632204,0.0,0.0,1,0.000000
1632753,1632157,0.0,0.0,1,0.000000
1632783,1632785,1.0,1.0,1,1.000000
1632785,1632783,1.0,1.0,1,1.000000


In [24]:
# összeszedjük a nőket egy külön dataframe-be

females = malefriends[malefriends['gender']==0]
females

Unnamed: 0_level_0,greater_id,gender,friends_gender,friends,male percentage
smaller_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4,1,0.0,1.0,1,1.000000
7,4070,0.0,10.0,20,0.500000
10,6575,0.0,17.0,28,0.607143
11,4828,0.0,8.0,16,0.500000
17,1580,0.0,1.0,4,0.250000
...,...,...,...,...,...
1632664,1631228,0.0,1.0,1,1.000000
1632689,3264063,0.0,0.0,2,0.000000
1632745,1632204,0.0,0.0,1,0.000000
1632753,1632157,0.0,0.0,1,0.000000


In [25]:
# kiválasztjuk azokat a nőket, akiknek 4-nél több ismerősük van (ki akarjuk szűrni azokat, akiknek nagyon kevés
# kapcsolatuk van, mert ők nagyon torzítják a becslésünket)

females = females[females['friends'] > 4]
females

Unnamed: 0_level_0,greater_id,gender,friends_gender,friends,male percentage
smaller_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7,4070,0.0,10.0,20,0.500000
10,6575,0.0,17.0,28,0.607143
11,4828,0.0,8.0,16,0.500000
25,21273,0.0,15.0,31,0.483871
29,10503,0.0,10.0,17,0.588235
...,...,...,...,...,...
1588215,7222354,0.0,2.0,5,0.400000
1589781,10001609,0.0,0.0,7,0.000000
1595248,7145696,0.0,0.0,5,0.000000
1612426,8878674,0.0,0.0,6,0.000000


In [26]:
# kiszámoljuk, hogy azoknak a nőknek, akiknek legalább 5 ismerősük van, az ismerőseik közül mekkora arányt
# képviselnek a férfiak

f = females['male percentage'].mean(axis=0)
print(f)

0.4395964510394471


In [27]:
# összeszedjük a férfiakat egy külön dataframe-be

males = malefriends[malefriends['gender']!=0]
males

Unnamed: 0_level_0,greater_id,gender,friends_gender,friends,male percentage
smaller_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,49,6.0,2.0,6,0.333333
5,1082,7.0,2.0,7,0.285714
12,2914,9.0,4.0,9,0.444444
18,2924,13.0,8.0,13,0.615385
19,25434,48.0,6.0,48,0.125000
...,...,...,...,...,...
1632700,1631220,1.0,0.0,1,0.000000
1632704,1631689,1.0,0.0,1,0.000000
1632724,1631566,1.0,0.0,1,0.000000
1632783,1632785,1.0,1.0,1,1.000000


In [28]:
# kiválasztjuk azokat a férfiakat, akiknek 4-nél több ismerősük van

males = males[males['friends'] > 4]
males

Unnamed: 0_level_0,greater_id,gender,friends_gender,friends,male percentage
smaller_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,49,6.0,2.0,6,0.333333
5,1082,7.0,2.0,7,0.285714
12,2914,9.0,4.0,9,0.444444
18,2924,13.0,8.0,13,0.615385
19,25434,48.0,6.0,48,0.125000
...,...,...,...,...,...
1587209,50085868,31.0,25.0,31,0.806452
1588557,6995462,5.0,2.0,5,0.400000
1588813,6995206,5.0,2.0,5,0.400000
1589159,9733943,6.0,5.0,6,0.833333


In [29]:
# kiszámoljuk, hogy azoknak a férfiaknak, akiknek legalább 5 ismerősük van, az ismerőseik közül mekkora arányt
# képviselnek a férfiak

m = males['male percentage'].mean(axis=0)
print(m)

0.4632309819653229


In [30]:
# Már tudjuk, hogy a férfiak és a nők ismerősei között hogy arányulnak egymáshoz a nemek
# A férfiak ismerőseinek kb. 46,32%-a férfi
# A nők ismerőseinek kb. 43,96%-a férfi
# A kettő átlagát használjuk cut-value-ként.

cut_value = (m+f)/2
print(cut_value)

0.45141371650238504


A cut-value tehát kb. 45,14%

Becslésünk szerint, ha egy felhasználó ismerőseinek több, mint 45,14%-a férfi, akkor férfinak becsüljük, ha kevesebb, akkor nőnek.

In [31]:
prediction = malefriends.copy()

In [32]:
# Ha férfi barátok aránya nagyobbb, mint a cut_value, akkor férfi a tipp, egyébként nő.

prediction['prediction'] = np.where(prediction['friends_gender']/prediction['friends'] > cut_value,1,0)
prediction.reset_index()
prediction.columns

Index(['greater_id', 'gender', 'friends_gender', 'friends', 'male percentage',
       'prediction'],
      dtype='object')

In [33]:
# Újra betöltjük az adatokat és ellenőrizzük az esetek hány százalékában találtuk el a nemet

selected_profiles_check,undirected_edges_check=load_and_select_profiles_and_edges_resultcheck()

loading profiles
loading edges
1    2642113
2    1992414
Name: source, dtype: int64
Selected profiles: 695406
Nodes with edges: 434590


In [34]:
prediction.reset_index()

Unnamed: 0,smaller_id,greater_id,gender,friends_gender,friends,male percentage,prediction
0,1,49,6.0,2.0,6,0.333333,0
1,4,1,0.0,1.0,1,1.000000,1
2,5,1082,7.0,2.0,7,0.285714,0
3,7,4070,0.0,10.0,20,0.500000,1
4,10,6575,0.0,17.0,28,0.607143,1
...,...,...,...,...,...,...,...
394585,1632745,1632204,0.0,0.0,1,0.000000,0
394586,1632753,1632157,0.0,0.0,1,0.000000,0
394587,1632783,1632785,1.0,1.0,1,1.000000,1
394588,1632785,1632783,1.0,1.0,1,1.000000,1


In [35]:
prediction = prediction.drop(columns=['friends_gender', 'gender'])

In [36]:
prediction = prediction.reset_index()

In [37]:
# Az ellenőrzéshez ismét a dataframe-hez csatoljuk az eredeti adatokat, hogy kinyerjük a 'gender' oszlopot.

selected_profiles_check
prediction = prediction.join(selected_profiles_check.set_index('user_id'),on='smaller_id')

In [38]:
# Kidobjuk a felesleges oszlopokat

prediction = prediction.drop(columns=['public','AGE', 'region','male percentage'])
prediction

Unnamed: 0,smaller_id,greater_id,friends,prediction,gender
0,1,49,6,0,1.0
1,4,1,1,1,0.0
2,5,1082,7,0,1.0
3,7,4070,20,1,0.0
4,10,6575,28,1,0.0
...,...,...,...,...,...
394585,1632745,1632204,1,0,0.0
394586,1632753,1632157,1,0,0.0
394587,1632783,1632785,1,1,1.0
394588,1632785,1632783,1,1,1.0


In [39]:
# Hozzáadjuk a 'result' oszlopot, mely értéke 1, ha a 'prediction' és a 'gender' megegyezik, különben 0.

prediction['result'] = np.where(prediction['prediction']==prediction['gender'],1,0)
prediction

Unnamed: 0,smaller_id,greater_id,friends,prediction,gender,result
0,1,49,6,0,1.0,0
1,4,1,1,1,0.0,0
2,5,1082,7,0,1.0,0
3,7,4070,20,1,0.0,0
4,10,6575,28,1,0.0,0
...,...,...,...,...,...,...
394585,1632745,1632204,1,0,0.0,1
394586,1632753,1632157,1,0,0.0,1
394587,1632783,1632785,1,1,1.0,1
394588,1632785,1632783,1,1,1.0,1


In [40]:
# Csak azokon végzünk előrejelzést, akiknek több, mint 4 ismerősük van

prediction = prediction[prediction['friends'] > 4]
prediction

Unnamed: 0,smaller_id,greater_id,friends,prediction,gender,result
0,1,49,6,0,1.0,0
2,5,1082,7,0,1.0,0
3,7,4070,20,1,0.0,0
4,10,6575,28,1,0.0,0
5,11,4828,16,1,0.0,0
...,...,...,...,...,...,...
391204,1589781,10001609,7,0,0.0,1
391634,1595248,7145696,5,0,0.0,1
393087,1612426,8878674,6,0,0.0,1
393091,1612451,8804427,6,0,0.0,1


In [41]:
# Kiszámoljuk, a modellünk hatékonyságát úgy, hogy összeadjuk a 'result' oszlop értékeit és ezt az összeget
# elosztjuk a sorok számával (tehát az összes becsült felhasználók számával). 

print('hatékonyság: ',prediction['result'].sum()/207395)

hatékonyság:  0.5237300802815883


A modellünk 52,37%-os pontossággal tudta előrejelezni egy felhasználó nemét.