In [7]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
from lenskit.algorithms import Recommender
from lenskit.algorithms.user_knn import UserUser
from lenskit.algorithms import user_knn
from lenskit.algorithms import Recommender

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kirillgugunishvili/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
# Scenario/ context:
# organisation helping with applications
# offices around the world
# limited resources so tries to group people as best it can (i.e. according to degree)
#   but sometimes not possible so just groups however comes into single session 
# people sign up for a session and can be grouped with people most similar to them 
# recommender engine is needed for organisers of the company to ensure they are able to conduct 
#   worshops that intrest the majority of group members in a session

# Procedure:
#   Get all people from same state
#   If high school only then separate from others
#   Else String match people based on their degree field
#   Limit group size: min=5, max=20

# Bonus:
#   Group by expirence (noobies to noobies)

# Another idea:
#   Online consultations -> state is not as important, only country for same language
#   Grouping mostly based on degrees + expirience

# load intial data
user_data = pd.read_csv("dataset/users.tsv", sep='\t')
display(user_data.head(10))


Unnamed: 0,UserID,WindowID,Split,City,State,Country,ZipCode,DegreeType,Major,GraduationDate,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany
0,47,1,Train,Paramount,CA,US,90723,High School,,1999-06-01 00:00:00,3,10.0,Yes,No,0
1,72,1,Train,La Mesa,CA,US,91941,Master's,Anthropology,2011-01-01 00:00:00,10,8.0,Yes,No,0
2,80,1,Train,Williamstown,NJ,US,8094,High School,Not Applicable,1985-06-01 00:00:00,5,11.0,Yes,Yes,5
3,98,1,Train,Astoria,NY,US,11105,Master's,Journalism,2007-05-01 00:00:00,3,3.0,Yes,No,0
4,123,1,Train,Baton Rouge,LA,US,70808,Bachelor's,Agricultural Business,2011-05-01 00:00:00,1,9.0,Yes,No,0
5,131,1,Train,Houston,TX,US,77077,Bachelor's,Finance,1998-05-01 00:00:00,3,14.0,,No,0
6,162,1,Train,Long Beach,CA,US,90807,Master's,I/O Psychology,2012-05-01 00:00:00,10,25.0,No,No,0
7,178,1,Train,Greenville,SC,US,29609,High School,Not Applicable,,6,35.0,No,Yes,4
8,203,1,Train,Colchester,VT,US,5446,Master's,Burlington,,4,3.0,Yes,No,0
9,344,1,Train,Newport News,VA,US,23601,High School,Not Applicable,2007-01-01 00:00:00,3,7.0,Yes,No,0


In [3]:
# extract geoposition list
cnt_col = user_data[["Country"]].drop_duplicates()
display(cnt_col)


Unnamed: 0,Country
0,US
131,BD
291,CA
379,IN
465,ES
...,...
329990,TC
330250,KZ
347339,AG
354762,PA


In [25]:
# extract high schoolers
undergrads_data = user_data[user_data["DegreeType"]=="High School"].fillna("Not Applicable")
undergrads_data = undergrads_data.reset_index(drop=True)
display(undergrads_data)

# extract non high schoolers
grads_data = user_data[user_data["DegreeType"]!="High School"].fillna("Not Applicable")
grads_data = grads_data.reset_index(drop=True)
display(grads_data)

Unnamed: 0,UserID,WindowID,Split,City,State,Country,ZipCode,DegreeType,Major,GraduationDate,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany
0,47,1,Train,Paramount,CA,US,90723,High School,Not Applicable,1999-06-01 00:00:00,3,10.0,Yes,No,0
1,80,1,Train,Williamstown,NJ,US,08094,High School,Not Applicable,1985-06-01 00:00:00,5,11.0,Yes,Yes,5
2,178,1,Train,Greenville,SC,US,29609,High School,Not Applicable,Not Applicable,6,35.0,No,Yes,4
3,344,1,Train,Newport News,VA,US,23601,High School,Not Applicable,2007-01-01 00:00:00,3,7.0,Yes,No,0
4,496,1,Train,Easley,SC,US,29640,High School,Not Applicable,1987-01-01 00:00:00,10,21.0,No,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93300,1471765,7,Train,Kennesaw,GA,US,30144,High School,Not Applicable,2012-06-01 00:00:00,1,2.0,No,No,0
93301,1471878,7,Train,Orangeburg,SC,US,29115,High School,Not Applicable,1999-01-01 00:00:00,4,3.0,No,No,0
93302,1471997,7,Train,Baltimore,MD,US,21222,High School,Math,2004-06-01 00:00:00,3,5.0,No,Yes,45
93303,1472042,7,Train,Saint Clair Shores,MI,US,48081,High School,Not Applicable,1970-01-01 00:00:00,7,32.0,Yes,No,0


Unnamed: 0,UserID,WindowID,Split,City,State,Country,ZipCode,DegreeType,Major,GraduationDate,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany
0,72,1,Train,La Mesa,CA,US,91941,Master's,Anthropology,2011-01-01 00:00:00,10,8.0,Yes,No,0
1,98,1,Train,Astoria,NY,US,11105,Master's,Journalism,2007-05-01 00:00:00,3,3.0,Yes,No,0
2,123,1,Train,Baton Rouge,LA,US,70808,Bachelor's,Agricultural Business,2011-05-01 00:00:00,1,9.0,Yes,No,0
3,131,1,Train,Houston,TX,US,77077,Bachelor's,Finance,1998-05-01 00:00:00,3,14.0,Not Applicable,No,0
4,162,1,Train,Long Beach,CA,US,90807,Master's,I/O Psychology,2012-05-01 00:00:00,10,25.0,No,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296398,1471877,7,Train,Greer,SC,US,29650,Not Applicable,Business Administration,1988-01-01 00:00:00,9,30.0,Not Applicable,No,0
296399,1471901,7,Train,Port Clinton,OH,US,43452,Not Applicable,Not Applicable,Not Applicable,7,25.0,No,No,0
296400,1471975,7,Train,Concord,NC,US,28027,Not Applicable,Not Applicable,Not Applicable,5,14.0,Yes,No,0
296401,1472058,7,Train,Maspeth,NY,US,11378,Associate's,Not Applicable,2001-01-01 00:00:00,2,8.0,Yes,Yes,3


In [26]:
# tokenize Major
g_maj_token = grads_data.apply(lambda row: nltk.word_tokenize(row["Major"]), axis=1)
display(g_maj_token.head(10))


0                   [Anthropology]
1                     [Journalism]
2         [Agricultural, Business]
3                        [Finance]
4                [I/O, Psychology]
5                     [Burlington]
6                      [Marketing]
7                [Not, Applicable]
8    [Office, Systems, Technology]
9                        [Nursing]
dtype: object

In [54]:
# 1-grams generated
itemCounter = 0
keywords = pd.DataFrame(columns=["Keyword"])
keywords = g_maj_token.explode().drop_duplicates()
display(keywords.head(20))

# 2-grams generated

0     Anthropology
1       Journalism
2     Agricultural
2         Business
3          Finance
4              I/O
4       Psychology
5       Burlington
6        Marketing
7              Not
7       Applicable
8           Office
8          Systems
8       Technology
9          Nursing
10      Accounting
12      Restaurant
12             and
12         Tourism
12      Management
dtype: object