### Import Libraries

In [1]:
pip install scipy




In [2]:
import pandas as pd
import  pickle
import numpy as np
from scipy.stats import halfnorm

In [3]:
with open("ProfileData_pickleFiles/profile_data.pkl",'rb') as fp:
    df = pickle.load(fp)

In [4]:
df

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Travelling,Foodie,Books,Politics,Finance,Coding
0,Evil beer aficionado. Freelance introvert. Tv ...,4,5,1,1,5,0,6,2,0,5,5
1,Passionate gamer. Evil internet aficionado. St...,5,3,5,0,5,6,3,1,2,4,7
2,Social media guru. Tv expert. Amateur beer eva...,0,3,2,5,1,7,0,4,0,7,9
3,Reader. Incurable analyst. Proud pop culture e...,4,3,7,2,1,3,1,5,9,4,5
4,Typical gamer. Friend of animals everywhere. A...,2,9,1,2,6,9,3,4,5,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...
745,Social media fanatic. Typical coffee buff. Fre...,7,9,2,7,6,7,5,1,8,9,6
746,Student. Reader. Bacon trailblazer. Explorer. ...,8,6,7,6,5,0,6,6,0,8,0
747,Reader. Tv lover. Beer nerd. Incurable interne...,9,2,3,4,4,5,3,6,3,8,4
748,Amateur travel expert. Organizer. Alcohol fana...,9,0,2,1,0,9,3,0,2,8,3


In [5]:
# Using only Bios
# Removing the numerical data
df = df[['Bios']]

In [6]:
df

Unnamed: 0,Bios
0,Evil beer aficionado. Freelance introvert. Tv ...
1,Passionate gamer. Evil internet aficionado. St...
2,Social media guru. Tv expert. Amateur beer eva...
3,Reader. Incurable analyst. Proud pop culture e...
4,Typical gamer. Friend of animals everywhere. A...
...,...
745,Social media fanatic. Typical coffee buff. Fre...
746,Student. Reader. Bacon trailblazer. Explorer. ...
747,Reader. Tv lover. Beer nerd. Incurable interne...
748,Amateur travel expert. Organizer. Alcohol fana...


### Creating Lists for the Categories
Here, we are specifying the categories into subcategories and allocating the probability of being an interest to each subcategory of interest

In [7]:
#Probability distribution
p = {}

# TV Genres
tv = ['Comedy',
      'Drama',
      'Action/Adventure',
      'Suspense/Thriller',
      'Documentaries',
      'Crime/Mystery',
      'News',
      'SciFi',
      'History']

p['TV'] = [0.25,
           0.21,
           0.17,
           0.16,
           0.09,
           0.08,
           0.03,
           0.02,
           0.01]

# Movie Genres
movies = ['Adventure',
          'Action',
          'Drama',
          'Comedy',
          'Thriller',
          'Horror',
          'RomCom',
          'Musical',
          'Documentary']

p['Movies'] = [0.26,
               0.21,
               0.16,
               0.14,
               0.09,
               0.06,
               0.04,
               0.01, 
               0.03]

# Religions (could potentially create a spectrum)
religion = ['Catholic',
            'Christian',
            'Jewish',
            'Muslim',
            'Hindu',
            'Buddhist',
            'Spiritual',
            'Other',
            'Agnostic',
            'Atheist']

p['Religion'] = [0.07,
                 0.13,
                 0.01,
                 0.19,
                 0.24,
                 0.05,
                 0.10,
                 0.09,
                 0.07,
                 0.05]

# Music
music = ['Rock',
         'HipHop',
         'Romantic',
         'Pop',
         'Country',
         'EDM',
         'Jazz',
         'Classical',
         ]

p['Music'] = [0.25,
              0.19,
              0.16,
              0.14,
              0.10,
              0.06,
              0.04,
              0.03,
              0.02,
              0.01,]

# Sports
sports = [
          'Cricket',
          'Chess',
          'Badminton'
          'Football',
          'Baseball',
          'Basketball',
          'Hockey',
          'Soccer',
          'Other']

p['Sports'] = [0.29,
               0.24,
               0.23, 
               0.13,
               0.04,
               0.03,
               0.02,
               0.02]

# Politics (could also put on a spectrum)
politics = ['Liberal',
            'Progressive',
            'Centrist',
            'Moderate',
            'Conservative']

p['Politics'] = [0.26,
                 0.11,
                 0.11,
                 0.15,
                 0.37]

# Social Media
social = ['Facebook',
          'Youtube',
          'Twitter',
          'Reddit',
          'Instagram',
          'Pinterest',
          'LinkedIn',
          'SnapChat',
          'TikTok']

p['Social Media'] = [0.36,
                     0.27,
                     0.11,
                     0.09,
                     0.05,
                     0.03,
                     0.03,
                     0.03,
                     0.03]

# Programming
programming = [
          'Python',
          'Java',
          'JavaScript',
          'C++',
          'C#',
          'Swift',
          'Go'
         ]

p['programming'] = [
                    0.23,
                    0.20,
                    0.18,
                    0.15,
                    0.12,
                    0.10,
                    0.02
                   ]

#travelling
travelling = [
            'Treking',
            'Adventure',
            'Long Trips',
            'Short journeys'
             ]

p['travelling'] = [
                 0.35,
                 0.33,
                 0.21,
                 0.11
                  ]


# Age (generating random numbers based on half normal distribution)
age = halfnorm.rvs(loc=18,scale=8, size=df.shape[0]).astype(int)

# Lists of Names and the list of the lists
categories = [movies, religion, music, politics, social, sports,programming, travelling, age]

names = ['Movies','Religion', 'Music', 'Politics', 'Social Media', 'Sports', 'Programming', 'Traveller', 'Age']

combined = dict(zip(names, categories))

In [8]:
combined

{'Movies': ['Adventure',
  'Action',
  'Drama',
  'Comedy',
  'Thriller',
  'Horror',
  'RomCom',
  'Musical',
  'Documentary'],
 'Religion': ['Catholic',
  'Christian',
  'Jewish',
  'Muslim',
  'Hindu',
  'Buddhist',
  'Spiritual',
  'Other',
  'Agnostic',
  'Atheist'],
 'Music': ['Rock',
  'HipHop',
  'Romantic',
  'Pop',
  'Country',
  'EDM',
  'Jazz',
  'Classical'],
 'Politics': ['Liberal',
  'Progressive',
  'Centrist',
  'Moderate',
  'Conservative'],
 'Social Media': ['Facebook',
  'Youtube',
  'Twitter',
  'Reddit',
  'Instagram',
  'Pinterest',
  'LinkedIn',
  'SnapChat',
  'TikTok'],
 'Sports': ['Cricket',
  'Chess',
  'BadmintonFootball',
  'Baseball',
  'Basketball',
  'Hockey',
  'Soccer',
  'Other'],
 'Programming': ['Python', 'Java', 'JavaScript', 'C++', 'C#', 'Swift', 'Go'],
 'Traveller': ['Treking', 'Adventure', 'Long Trips', 'Short journeys'],
 'Age': array([33, 22, 19, 24, 24, 30, 23, 26, 18, 26, 19, 22, 21, 26, 23, 21, 19,
        31, 21, 21, 18, 20, 19, 19, 29, 2

### Assigning random values to each subcategory of a category
Looping through the combined df, we will assign randomly generated value from each subcategory to the repsective category

In [9]:
# Looping through and assigning random values
for name, ops in combined.items():
    if name in ['Religion', 'Politics']:
        # Picking only 1 from the list
        df[name] = np.random.choice(ops, df.shape[0], p=p[name])
        
    elif name == 'Age':
        # Generating random ages based on a normal distribution
        df[name] = ops
    else:
        # Picking 3 from the list 
        try:
            df[name] = list(np.random.choice(ops, size=(df.shape[0],1,3), p=p[name]))
        except:
            df[name] = list(np.random.choice(ops, size=(df.shape[0],1,3)))
        
        df[name] = df[name].apply(lambda x: list(set(x[0].tolist())))
        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[name] = list(np.random.choice(ops, size=(df.shape[0],1,3), p=p[name]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[name] = df[name].apply(lambda x: list(set(x[0].tolist())))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[name] = np.random.choice(ops, df.shape[0], p=p[name])
A value is t

In [10]:
df

Unnamed: 0,Bios,Movies,Religion,Music,Politics,Social Media,Sports,Programming,Traveller,Age
0,Evil beer aficionado. Freelance introvert. Tv ...,"[Adventure, RomCom, Action]",Buddhist,"[Country, Pop, Romantic]",Progressive,"[LinkedIn, Facebook]","[Baseball, Chess, Basketball]","[C++, Java, C#]","[Short journeys, Long Trips, Adventure]",33
1,Passionate gamer. Evil internet aficionado. St...,"[Musical, Drama, Comedy]",Muslim,"[Jazz, Romantic, Rock]",Moderate,"[Youtube, Facebook, Pinterest]","[Cricket, Baseball, BadmintonFootball]","[C++, Swift]","[Short journeys, Long Trips, Adventure]",22
2,Social media guru. Tv expert. Amateur beer eva...,"[Adventure, RomCom, Action]",Christian,"[Romantic, EDM, HipHop]",Liberal,"[Youtube, Facebook]","[Cricket, BadmintonFootball, Chess]","[C++, Swift, Go]","[Short journeys, Long Trips, Adventure]",19
3,Reader. Incurable analyst. Proud pop culture e...,"[Adventure, Action, Thriller]",Other,"[Country, Romantic]",Liberal,"[Youtube, Facebook]","[Baseball, Cricket]","[Java, JavaScript]","[Long Trips, Adventure, Treking]",24
4,Typical gamer. Friend of animals everywhere. A...,"[Adventure, Drama, Action]",Christian,"[Jazz, Romantic, HipHop]",Liberal,"[Youtube, Facebook, SnapChat]","[Cricket, Hockey, Basketball]","[C++, Java, Swift]","[Adventure, Treking]",24
...,...,...,...,...,...,...,...,...,...,...
745,Social media fanatic. Typical coffee buff. Fre...,"[Drama, Adventure, Comedy]",Other,"[Classical, Romantic, HipHop]",Conservative,[Facebook],"[Baseball, Cricket, Chess]","[Python, Java, Go]","[Adventure, Treking]",23
746,Student. Reader. Bacon trailblazer. Explorer. ...,"[Drama, RomCom, Adventure]",Spiritual,"[Country, Pop, HipHop]",Conservative,"[Reddit, Facebook]","[Cricket, Baseball, Chess]","[JavaScript, Java, Go]","[Short journeys, Long Trips, Treking]",42
747,Reader. Tv lover. Beer nerd. Incurable interne...,"[Horror, Action, Thriller]",Hindu,"[Jazz, Country, Romantic]",Centrist,"[Youtube, Instagram]","[Cricket, Baseball, Hockey]","[Java, JavaScript]","[Short journeys, Long Trips, Adventure]",26
748,Amateur travel expert. Organizer. Alcohol fana...,"[Action, Adventure]",Christian,"[Pop, Romantic, HipHop]",Moderate,"[Youtube, Reddit, Facebook]","[Baseball, Other, BadmintonFootball]","[C++, Go]","[Short journeys, Long Trips, Treking]",30


### Categorizing the religion and poilitics

In [11]:
df['Religion'] = pd.Categorical(df.Religion, ordered=True,
                                categories=['Catholic',
                                            'Christian',
                                            'Jewish',
                                            'Muslim',
                                            'Hindu',
                                            'Buddhist',
                                            'Spiritual',
                                            'Other',
                                            'Agnostic',
                                            'Atheist'])

df['Politics'] = pd.Categorical(df.Politics, ordered=True,
                                categories=['Liberal',
                                            'Progressive',
                                            'Centrist',
                                            'Moderate',
                                            'Conservative'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Religion'] = pd.Categorical(df.Religion, ordered=True,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Politics'] = pd.Categorical(df.Politics, ordered=True,


In [12]:
df

Unnamed: 0,Bios,Movies,Religion,Music,Politics,Social Media,Sports,Programming,Traveller,Age
0,Evil beer aficionado. Freelance introvert. Tv ...,"[Adventure, RomCom, Action]",Buddhist,"[Country, Pop, Romantic]",Progressive,"[LinkedIn, Facebook]","[Baseball, Chess, Basketball]","[C++, Java, C#]","[Short journeys, Long Trips, Adventure]",33
1,Passionate gamer. Evil internet aficionado. St...,"[Musical, Drama, Comedy]",Muslim,"[Jazz, Romantic, Rock]",Moderate,"[Youtube, Facebook, Pinterest]","[Cricket, Baseball, BadmintonFootball]","[C++, Swift]","[Short journeys, Long Trips, Adventure]",22
2,Social media guru. Tv expert. Amateur beer eva...,"[Adventure, RomCom, Action]",Christian,"[Romantic, EDM, HipHop]",Liberal,"[Youtube, Facebook]","[Cricket, BadmintonFootball, Chess]","[C++, Swift, Go]","[Short journeys, Long Trips, Adventure]",19
3,Reader. Incurable analyst. Proud pop culture e...,"[Adventure, Action, Thriller]",Other,"[Country, Romantic]",Liberal,"[Youtube, Facebook]","[Baseball, Cricket]","[Java, JavaScript]","[Long Trips, Adventure, Treking]",24
4,Typical gamer. Friend of animals everywhere. A...,"[Adventure, Drama, Action]",Christian,"[Jazz, Romantic, HipHop]",Liberal,"[Youtube, Facebook, SnapChat]","[Cricket, Hockey, Basketball]","[C++, Java, Swift]","[Adventure, Treking]",24
...,...,...,...,...,...,...,...,...,...,...
745,Social media fanatic. Typical coffee buff. Fre...,"[Drama, Adventure, Comedy]",Other,"[Classical, Romantic, HipHop]",Conservative,[Facebook],"[Baseball, Cricket, Chess]","[Python, Java, Go]","[Adventure, Treking]",23
746,Student. Reader. Bacon trailblazer. Explorer. ...,"[Drama, RomCom, Adventure]",Spiritual,"[Country, Pop, HipHop]",Conservative,"[Reddit, Facebook]","[Cricket, Baseball, Chess]","[JavaScript, Java, Go]","[Short journeys, Long Trips, Treking]",42
747,Reader. Tv lover. Beer nerd. Incurable interne...,"[Horror, Action, Thriller]",Hindu,"[Jazz, Country, Romantic]",Centrist,"[Youtube, Instagram]","[Cricket, Baseball, Hockey]","[Java, JavaScript]","[Short journeys, Long Trips, Adventure]",26
748,Amateur travel expert. Organizer. Alcohol fana...,"[Action, Adventure]",Christian,"[Pop, Romantic, HipHop]",Moderate,"[Youtube, Reddit, Facebook]","[Baseball, Other, BadmintonFootball]","[C++, Go]","[Short journeys, Long Trips, Treking]",30


### Saving and exporting the pickle file

In [13]:
with open("refined_profiles.pkl",'wb') as fp:
    pickle.dump(df, fp)