In [2]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
from lenskit.algorithms import Recommender
from lenskit.algorithms.user_knn import UserUser
from lenskit.algorithms import user_knn
from lenskit.algorithms import Recommender

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kirillgugunishvili/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Scenario/ context:
# organisation helping with applications
# offices around the world
# limited resources so tries to group people as best it can (i.e. according to degree)
#   but sometimes not possible so just groups however comes into single session 
# people sign up for a session and can be grouped with people most similar to them 
# recommender engine is needed for organisers of the company to ensure they are able to conduct 
#   worshops that intrest the majority of group members in a session

# Procedure:
#   Get all people from same state
#   If high school only then separate from others
#   Else String match people based on their degree field
#   Limit group size: min=5, max=20

# Bonus:
#   Group by expirence (noobies to noobies)

# Another idea:
#   Online consultations -> state is not as important, only country for same language
#   Grouping mostly based on degrees + expirience

# load intial data
user_data = pd.read_csv("dataset/users.tsv", sep='\t')
display(user_data.head(10))


Unnamed: 0,UserID,WindowID,Split,City,State,Country,ZipCode,DegreeType,Major,GraduationDate,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany
0,47,1,Train,Paramount,CA,US,90723,High School,,1999-06-01 00:00:00,3,10.0,Yes,No,0
1,72,1,Train,La Mesa,CA,US,91941,Master's,Anthropology,2011-01-01 00:00:00,10,8.0,Yes,No,0
2,80,1,Train,Williamstown,NJ,US,8094,High School,Not Applicable,1985-06-01 00:00:00,5,11.0,Yes,Yes,5
3,98,1,Train,Astoria,NY,US,11105,Master's,Journalism,2007-05-01 00:00:00,3,3.0,Yes,No,0
4,123,1,Train,Baton Rouge,LA,US,70808,Bachelor's,Agricultural Business,2011-05-01 00:00:00,1,9.0,Yes,No,0
5,131,1,Train,Houston,TX,US,77077,Bachelor's,Finance,1998-05-01 00:00:00,3,14.0,,No,0
6,162,1,Train,Long Beach,CA,US,90807,Master's,I/O Psychology,2012-05-01 00:00:00,10,25.0,No,No,0
7,178,1,Train,Greenville,SC,US,29609,High School,Not Applicable,,6,35.0,No,Yes,4
8,203,1,Train,Colchester,VT,US,5446,Master's,Burlington,,4,3.0,Yes,No,0
9,344,1,Train,Newport News,VA,US,23601,High School,Not Applicable,2007-01-01 00:00:00,3,7.0,Yes,No,0


In [4]:
# extract geoposition list
cnt_col = user_data[["Country"]].drop_duplicates()
display(cnt_col)


Unnamed: 0,Country
0,US
131,BD
291,CA
379,IN
465,ES
...,...
329990,TC
330250,KZ
347339,AG
354762,PA


In [5]:
# extract high schoolers
undergrads_data = user_data[user_data["DegreeType"]=="High School"]
undergrads_data = undergrads_data.reset_index(drop=True).fillna("Not Applicable")
display(undergrads_data)

# extract non high schoolers
grads_data = user_data.fillna("Not Applicable")
grads_data = grads_data[(grads_data["DegreeType"]!="High School") & (grads_data["DegreeType"]!="Not Applicable")]
grads_data = grads_data.reset_index(drop=True)
display(grads_data)

# extract no education
noedu_data = user_data.fillna("Not Applicable")
noedu_data = noedu_data[noedu_data["DegreeType"]=="Not Applicable"]
noedu_data = noedu_data.reset_index(drop=True)
display(noedu_data)


Unnamed: 0,UserID,WindowID,Split,City,State,Country,ZipCode,DegreeType,Major,GraduationDate,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany
0,47,1,Train,Paramount,CA,US,90723,High School,Not Applicable,1999-06-01 00:00:00,3,10.0,Yes,No,0
1,80,1,Train,Williamstown,NJ,US,08094,High School,Not Applicable,1985-06-01 00:00:00,5,11.0,Yes,Yes,5
2,178,1,Train,Greenville,SC,US,29609,High School,Not Applicable,Not Applicable,6,35.0,No,Yes,4
3,344,1,Train,Newport News,VA,US,23601,High School,Not Applicable,2007-01-01 00:00:00,3,7.0,Yes,No,0
4,496,1,Train,Easley,SC,US,29640,High School,Not Applicable,1987-01-01 00:00:00,10,21.0,No,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93300,1471765,7,Train,Kennesaw,GA,US,30144,High School,Not Applicable,2012-06-01 00:00:00,1,2.0,No,No,0
93301,1471878,7,Train,Orangeburg,SC,US,29115,High School,Not Applicable,1999-01-01 00:00:00,4,3.0,No,No,0
93302,1471997,7,Train,Baltimore,MD,US,21222,High School,Math,2004-06-01 00:00:00,3,5.0,No,Yes,45
93303,1472042,7,Train,Saint Clair Shores,MI,US,48081,High School,Not Applicable,1970-01-01 00:00:00,7,32.0,Yes,No,0


Unnamed: 0,UserID,WindowID,Split,City,State,Country,ZipCode,DegreeType,Major,GraduationDate,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany
0,72,1,Train,La Mesa,CA,US,91941,Master's,Anthropology,2011-01-01 00:00:00,10,8.0,Yes,No,0
1,98,1,Train,Astoria,NY,US,11105,Master's,Journalism,2007-05-01 00:00:00,3,3.0,Yes,No,0
2,123,1,Train,Baton Rouge,LA,US,70808,Bachelor's,Agricultural Business,2011-05-01 00:00:00,1,9.0,Yes,No,0
3,131,1,Train,Houston,TX,US,77077,Bachelor's,Finance,1998-05-01 00:00:00,3,14.0,Not Applicable,No,0
4,162,1,Train,Long Beach,CA,US,90807,Master's,I/O Psychology,2012-05-01 00:00:00,10,25.0,No,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196245,1471622,7,Train,Jersey City,NJ,US,07307,Bachelor's,Psychology,2012-06-01 00:00:00,4,8.0,Yes,No,0
196246,1471633,7,Train,Sacramento,CA,US,95823,Bachelor's,Deaf Education,1985-01-01 00:00:00,9,14.0,Yes,Yes,30
196247,1471828,7,Train,Atlanta,GA,US,30312,PhD,Not Applicable,2006-05-01 00:00:00,3,6.0,Yes,No,0
196248,1472058,7,Train,Maspeth,NY,US,11378,Associate's,Not Applicable,2001-01-01 00:00:00,2,8.0,Yes,Yes,3


Unnamed: 0,UserID,WindowID,Split,City,State,Country,ZipCode,DegreeType,Major,GraduationDate,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany
0,411,1,Train,Lutz,FL,US,33559,Not Applicable,Not Applicable,Not Applicable,3,11.0,Yes,Yes,1
1,640,1,Train,Cooper City,FL,US,33330,Not Applicable,Not Applicable,Not Applicable,4,34.0,Not Applicable,No,0
2,1113,1,Train,Ramsey,NJ,US,07446,Not Applicable,Not Applicable,2011-10-01 00:00:00,2,1.0,No,No,0
3,1123,1,Train,Jacksonville,FL,US,32256,Not Applicable,Civil Engineering and Architecture,2012-12-01 00:00:00,3,9.0,Yes,Yes,100
4,1226,1,Train,Cincinnati,OH,US,45245,Not Applicable,Licensed Practical/Vocational Nurse Training (...,1988-12-01 00:00:00,8,23.0,No,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100148,1471719,7,Train,Baltimore,MD,US,21216,Not Applicable,Customer service,2008-01-01 00:00:00,0,Not Applicable,No,No,0
100149,1471870,7,Train,Lubbock,TX,US,79412,Not Applicable,Criminal Justice,2003-01-01 00:00:00,5,19.0,No,No,0
100150,1471877,7,Train,Greer,SC,US,29650,Not Applicable,Business Administration,1988-01-01 00:00:00,9,30.0,Not Applicable,No,0
100151,1471901,7,Train,Port Clinton,OH,US,43452,Not Applicable,Not Applicable,Not Applicable,7,25.0,No,No,0


In [14]:
# Count how many jobs
counts = grads_data.groupby(["Major"]).size().reset_index(name='count').sort_values(by='count', ascending=False)
display(counts.head(20))

Unnamed: 0,Major,count
25535,Not Applicable,20521
5010,Business Administration,10725
384,Accounting,6672
6178,Business Management,4801
27972,Psychology,4323
4817,Business,3123
10927,Criminal Justice,2964
14939,Finance,2706
22674,Marketing,2701
5371,"Business Administration and Management, General",2499


In [7]:
# tokenize Major
g_maj_token = grads_data.apply(lambda row: nltk.word_tokenize(row["Major"]), axis=1)
display(g_maj_token.head(10))


0                   [Anthropology]
1                     [Journalism]
2         [Agricultural, Business]
3                        [Finance]
4                [I/O, Psychology]
5                     [Burlington]
6                      [Marketing]
7    [Office, Systems, Technology]
8                        [Nursing]
9                     [Accounting]
dtype: object

In [8]:
# 1-grams generated
itemCounter = 0
keywords = pd.DataFrame(columns=["Keyword"])
keywords = g_maj_token.explode().drop_duplicates().reset_index(drop=True)
display(keywords.head(20))

# create user-major-keyword df
grads_shrt = grads_data[["UserID", "Major"]]
grads_shrt["Keywords"] = ''
display(grads_shrt)

# fill in the keyword column
grads_shrt["Major"]

0     Anthropology
1       Journalism
2     Agricultural
3         Business
4          Finance
5              I/O
6       Psychology
7       Burlington
8        Marketing
9           Office
10         Systems
11      Technology
12         Nursing
13      Accounting
14             Not
15      Applicable
16      Restaurant
17             and
18         Tourism
19      Management
dtype: object

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grads_shrt["Keywords"] = ''


Unnamed: 0,UserID,Major,Keywords
0,72,Anthropology,
1,98,Journalism,
2,123,Agricultural Business,
3,131,Finance,
4,162,I/O Psychology,
...,...,...,...
196245,1471622,Psychology,
196246,1471633,Deaf Education,
196247,1471828,Not Applicable,
196248,1472058,Not Applicable,


0                  Anthropology
1                    Journalism
2         Agricultural Business
3                       Finance
4                I/O Psychology
                  ...          
196245               Psychology
196246           Deaf Education
196247           Not Applicable
196248           Not Applicable
196249          Interior Design
Name: Major, Length: 196250, dtype: object

In [9]:
# full match try
grads_data

Unnamed: 0,UserID,WindowID,Split,City,State,Country,ZipCode,DegreeType,Major,GraduationDate,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany
0,72,1,Train,La Mesa,CA,US,91941,Master's,Anthropology,2011-01-01 00:00:00,10,8.0,Yes,No,0
1,98,1,Train,Astoria,NY,US,11105,Master's,Journalism,2007-05-01 00:00:00,3,3.0,Yes,No,0
2,123,1,Train,Baton Rouge,LA,US,70808,Bachelor's,Agricultural Business,2011-05-01 00:00:00,1,9.0,Yes,No,0
3,131,1,Train,Houston,TX,US,77077,Bachelor's,Finance,1998-05-01 00:00:00,3,14.0,Not Applicable,No,0
4,162,1,Train,Long Beach,CA,US,90807,Master's,I/O Psychology,2012-05-01 00:00:00,10,25.0,No,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196245,1471622,7,Train,Jersey City,NJ,US,07307,Bachelor's,Psychology,2012-06-01 00:00:00,4,8.0,Yes,No,0
196246,1471633,7,Train,Sacramento,CA,US,95823,Bachelor's,Deaf Education,1985-01-01 00:00:00,9,14.0,Yes,Yes,30
196247,1471828,7,Train,Atlanta,GA,US,30312,PhD,Not Applicable,2006-05-01 00:00:00,3,6.0,Yes,No,0
196248,1472058,7,Train,Maspeth,NY,US,11378,Associate's,Not Applicable,2001-01-01 00:00:00,2,8.0,Yes,Yes,3
