In [1]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
cwd = os.getcwd()
file_name = "MINDsmall_train/news.tsv"
file_path =  os.path.join(cwd, file_name)

In [3]:
news_df = pd.read_csv(file_path, sep='\t',header=None, names=['NewsID', 'Category', 'SubCategory', 'Title' , 'Abstract' , 'URL', 'TitleEntities', 'AbstractEntities'])

In [4]:
news_df.head()

Unnamed: 0,NewsID,Category,SubCategory,Title,Abstract,URL,TitleEntities,AbstractEntities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [5]:
news_df.drop(columns=['URL', 'TitleEntities' ,'AbstractEntities' ], inplace=True)

In [6]:
news_df.head()

Unnamed: 0,NewsID,Category,SubCategory,Title,Abstract
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the..."
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re..."


In [7]:
news_df.isna().sum()

NewsID            0
Category          0
SubCategory       0
Title             0
Abstract       2666
dtype: int64

In [8]:
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')

In [31]:
from pinecone import Pinecone, ServerlessSpec

INDEX_NAME = 'pc-index-news'
pinecone = Pinecone(api_key=PINECONE_API_KEY)

if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
  pinecone.delete_index(INDEX_NAME)

pinecone.create_index(name=INDEX_NAME, dimension=384, metric='cosine',
  spec=ServerlessSpec(cloud='aws', region='us-east-1'))

index = pinecone.Index(INDEX_NAME)

In [32]:
# Confirm we indeed created our  index

pinecone.list_indexes().names()

['pc-index-news']

In [33]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [12]:
from sentence_transformers import SentenceTransformer


def get_embeddings(articles, model=SentenceTransformer('all-MiniLM-L6-v2')):
   return model.encode(articles)

In [14]:
import math

# Function to check if a value is NaN
def is_nan(value):
    try:
        return math.isnan(value)
    except:
        return False

In [34]:
from tqdm.auto import tqdm, trange

CHUNK_SIZE=200
TOTAL_ROWS=10000
progress_bar = tqdm(total= TOTAL_ROWS)
use_columns = [0,1,2,3,4]
namespace = 'NewsTitles'

chunks = pd.read_csv(file_path, sep='\t',header=None,usecols=use_columns , 
                     names=['NewsID', 'Category', 'SubCategory', 'Title' , 'Abstract' ]
                     , chunksize=CHUNK_SIZE
                     ,nrows=TOTAL_ROWS 
                     )



chunk_num = 0
for chunk in chunks:
    titles = chunk['Title'].tolist()
    newsIds = chunk['NewsID'].tolist()
    cleaned_titles = [title if not is_nan(title) else "" for title in titles]
    embeddings = get_embeddings(titles)
    prepped = [{'id':str(newsIds[i])
                , 'values':embeddings[i],
                'metadata':{'title':cleaned_titles[i]}} 
               for i in range(0,len(titles))]
    chunk_num = chunk_num + 1
    if len(prepped) >= 200:
        try:
            index.upsert(prepped , namespace=namespace)
        except Exception as e:
            print(f"Error upserting chunk {chunk_num}: {e}")
        prepped = []
    progress_bar.update(len(chunk))

#final
if prepped:
    try:
        index.upsert(prepped,namespace=namespace)
    except Exception as e:
        print(f"Error upserting final chunk: {e}")

100%|██████████| 10000/10000 [55:24<00:00,  3.01it/s] 
100%|██████████| 10000/10000 [01:45<00:00, 89.79it/s]

In [35]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'NewsTitles': {'vector_count': 10000}},
 'total_vector_count': 10000}

In [40]:
def get_recommendations(pinecone_index, search_term, namespace = 'NewsTitles' ,top_k=10 ):
  embed = get_embeddings(search_term)
  res = pinecone_index.query(namespace = namespace ,vector=embed.tolist(), top_k=top_k, include_metadata=True )
  return res

In [42]:
reco = get_recommendations(index, 'royalty', 'NewsTitles')
for r in reco.matches:
    print(f'{r.score} : {r.id}-{r.metadata["title"]}')

0.575195551 : N7716-What Do Prince George & Princess Charlotte Know About Their Royal Roles?
0.56598568 : N42777-Prince George's Royal Life in Photos
0.510303736 : N3164-Queen's Speech in Parliament: Black Rod, hostage-taking and a gold stick
0.506915033 : N39208-21 Things You Never Knew About America's 'Royal' Families
0.503744 : N48912-The British Royal Family and Their Pets: Which Is Your Favorite?
0.502944469 : N55528-The Brands Queen Elizabeth, Prince Charles, and Prince Philip Swear By
0.490994751 : N38397-50 Incredible Photos of the Swedish Royal Family Through the Years
0.485220969 : N21477-Everything You Need to Know About the Swedish Royal Family
0.482100427 : N54727-The surprising age differences between your favorite royal couples
0.458627045 : N47247-Royal engagement rings


**Part2 : User Data**

In [123]:
file_name = "MINDsmall_train/behaviors.tsv"
file_path =  os.path.join(cwd, file_name)
behavior_df = pd.read_csv(file_path, sep='\t',header=None, names=['ImpressionID', 'UserID', 'Time', 'History' , 'Impressions' ])
behavior_df.head()

Unnamed: 0,ImpressionID,UserID,Time,History,Impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N10414 N19347 N31801,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129 N1569 N17686 N13008 N21623 N6233 N14340 N48031 N62285 N44383 N23061 N16290 N6244 N45099 N58715 N59049 N7023 N50528 N42704 N46082 N8275 N15710 N59026 N8429 N30867 N56514 N19709 N31402 N31741 N54889 N9798 N62612 N2663 N16617 N6087 N13231 N63317 N61388 N59359 N51163 N30698 N34567 N54225 N32852 N55833 N64467 N3142 N13912 N29802 N44462 N29948 N4486 N5398 N14761 N47020 N65112 N31699 N37159 N61101 N14761 N3433 N10438 N61355 N21164 N22976 N2511 N48390 N58224 N48742 N35458 N24611 N37509 N21773 N41011 N19041 N25785,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N22407-0 N14592-0 N17059-1 N33677-0 N7821-0 N6890-0
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384 N46616 N52500 N33164 N47289 N24233 N62058 N26378 N49475 N18870,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N59685-0 N23814-1 N23446-0 N64174-0 N11817-0 N60550-0 N48225-0 N45509-0 N56711-0 N46821-0 N48017-0 N8015-0 N5364-0 N48722-0 N55555-0 N37348-0 N40109-0 N59495-0 N36226-0 N38779-0 N47346-0 N48875-0 N10960-0 N29739-0 N50872-0 N50592-0 N13131-0 N3839-0 N12330-0 N47098-0 N51570-0
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 N29757 N31825 N51891,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N60408-0 N61497-0 N47412-0 N41220-0 N1940-0 N724-0 N11363-0 N261-0 N33883-0 N36807-0 N11967-0 N17896-0 N13486-0 N10413-0 N54274-0 N4247-0 N27497-0 N38512-0 N30253-0 N45389-0 N20015-0 N20678-0 N54003-0 N35850-0 N33261-0 N32010-0 N57426-0 N7419-0 N50023-0 N36446-0 N26940-0 N28495-0 N19318-0 N4936-0 N28414-0 N25108-0 N32791-0 N23563-0 N39317-0 N16166-0 N37058-0 N64851-0 N46992-0 N57327-0 N12995-0 N58363-0 N53084-0 N11094-0 N36436-0 N305-0 N58241-0 N33212-0 N6975-0 N58114-0 N3344-0 N25406-0 N4741-0 N33885-0 N20915-0 N44941-0 N57319-0 N36532-0 N61822-0 N20527-0


In [135]:
behavior_df[behavior_df['History'].isna()]

Unnamed: 0,ImpressionID,UserID,Time,History,Impressions
28,29,U33207,11/11/2019 11:09:14 AM,,N62212-0 N27521-0 N28983-0 N12028-0 N25437-0 N26262-0 N7041-1 N30475-0 N40065-0 N18708-0 N50406-0 N40530-0 N56598-0 N28324-0 N59981-0 N19735-0 N33848-0 N2823-1 N24686-0 N33872-0 N45456-0 N45076-0 N6056-1 N3894-0 N36545-0 N21915-0 N61359-0 N61623-0 N15761-0 N61549-0 N16804-0 N55204-0 N47433-0 N32791-0 N24180-0 N2869-0 N41698-0 N12800-0 N43338-0 N9139-0 N2-0 N5905-0 N41651-0 N6379-0 N45660-0 N28578-0 N45616-0 N3697-0 N58030-0 N8957-0 N17011-0 N3541-0 N54482-0 N27079-0 N21519-0 N48890-0 N49879-0 N26130-0 N41881-1 N38311-0 N23935-0 N1019-0 N18870-0 N51241-0 N43432-0 N23784-0 N47006-0 N47067-0 N63817-0 N4815-0 N47981-0 N15020-0 N47229-0 N62093-0 N60162-0 N1914-0 N11821-0 N13816-0 N6099-0 N30135-0 N55940-0 N30108-0 N33831-0 N23485-0 N4633-0 N43748-0 N31763-0 N51346-0 N11830-0 N3474-0 N36712-0 N7128-0 N36273-0 N53008-0 N37814-0 N21840-0 N10754-0 N59436-0 N17140-0 N57005-0 N5075-0 N27577-0 N64397-0 N8061-0 N40431-0 N6375-0 N20018-0 N14780-0 N25953-0 N47747-0 N36442-0 N47475-0 N17340-0 N11301-0 N43789-0 N4936-0 N53214-0 N44324-0 N36933-0 N7466-0 N5370-0 N13730-0 N24301-0 N64243-0 N21707-0 N35671-0 N42144-0 N2952-0 N64009-0 N54868-0 N63538-0 N35729-0 N22442-0 N26286-0 N43415-0 N50005-0 N36621-0 N59385-0 N40318-0 N48476-0 N10464-0 N48578-0 N26331-0 N13259-0 N17106-0 N52622-0 N6342-0 N42961-0 N58410-1 N55689-0 N29128-0 N17482-0 N54988-0 N22457-0 N29001-0 N38947-0 N46878-0 N26712-0 N49240-0 N58553-0 N63550-0 N52585-1 N23254-0 N24176-0 N57030-0 N26902-0 N61774-0 N15855-0 N59343-0 N53585-1 N13341-0 N3841-0 N46845-0 N40725-0
96,97,U89839,11/11/2019 4:57:51 AM,,N12042-0 N35729-0 N49685-0 N62729-1
155,156,U5598,11/13/2019 2:01:40 PM,,N7121-0 N37437-0 N53861-0 N45891-0 N21141-0 N50415-0 N14029-0 N30035-0 N4510-1 N26376-0 N35958-0 N41172-0 N22417-0 N17117-0 N64096-0 N13907-0 N62716-0 N21128-0 N38215-0 N8509-0 N55949-0 N47061-0 N35273-0 N4642-0 N11769-0 N7618-0 N43301-0 N58114-0 N21420-0 N15810-0 N2578-0 N38808-0 N55281-0 N56214-0 N45266-0 N36221-0
194,195,U19208,11/11/2019 3:57:34 PM,,N23414-1 N52474-0 N6099-0 N5980-0 N1914-0
257,258,U6879,11/14/2019 5:33:52 PM,,N9284-0 N23391-0 N42849-0 N33828-0 N57081-0 N27737-0 N54792-0 N17031-0 N52867-1 N55275-0 N6645-0 N16118-0 N5981-0 N6051-0 N7494-0 N50484-0 N41934-0 N55949-0 N15574-0 N63913-0 N21571-0 N1031-0 N16120-0 N36931-0 N28072-0 N36332-0 N64919-0 N60539-0 N20793-0 N62992-0 N19050-0 N11681-0 N64459-0 N29735-0 N55322-0 N21077-0 N61233-0 N29490-0 N3380-0 N43277-0 N45428-0 N48487-0 N59841-0 N19019-0 N30689-0 N14478-0 N36288-0 N23767-0 N18529-0 N15678-0 N53630-0 N34668-0 N14285-0 N37204-0 N47652-0 N8373-0 N61740-0 N12300-0 N37994-0 N22975-0 N34915-0 N45523-0 N1204-0 N47612-0 N6388-0 N42515-0 N53031-0 N14056-0 N58171-0 N56211-0 N4485-0 N6837-0 N18644-0 N22257-0 N54752-0 N63342-0 N41658-0 N9621-0 N8468-0 N11931-0 N36779-0 N54321-0 N48707-0
...,...,...,...,...,...
156850,156851,U85936,11/10/2019 8:52:02 AM,,N41140-1 N4148-0
156890,156891,U4828,11/11/2019 9:41:21 AM,,N8957-0 N60162-0 N16804-0 N24180-0 N31370-0 N22999-0 N48890-0 N40318-0 N26286-0 N43338-0 N59436-0 N35729-1 N15761-0 N29001-0 N42961-0 N64787-0 N8061-0 N19735-0 N33872-0 N58553-0 N50570-0 N24686-0 N27521-0 N29128-0 N40530-0 N10464-0 N32791-0 N49879-0 N33848-0 N38947-0 N154-0 N61623-0 N46845-0 N61359-1 N30135-0 N2869-0 N48476-0 N36621-0 N25953-0 N10754-0 N5075-0 N18378-0 N26376-0 N40431-0 N51346-0 N40065-0 N8399-0 N6099-0 N6379-0 N45375-0 N36442-0 N56598-0 N62212-0 N43789-0 N36712-0 N5370-0 N47229-0 N6342-0 N26331-0 N64243-0 N55204-0 N13259-0 N34390-0 N7128-0 N59981-0 N18708-0 N55689-0 N31727-0 N58710-0 N33159-0 N12523-0 N7809-0 N49240-0 N54988-0 N9271-0 N15435-0 N17140-0 N13801-0 N52622-0 N21707-0 N47067-0 N64542-0 N11830-0 N16342-0 N47981-0 N1914-0 N15855-0 N45076-0 N24176-0 N63538-0 N63550-0 N11821-0 N54482-0 N35776-0 N61214-0 N21915-0 N26043-0 N12800-0 N43415-0 N51639-1 N18870-0 N15830-0
156897,156898,U38290,11/14/2019 3:50:07 PM,,N59267-0 N12446-0 N53031-0 N3380-0 N56211-0 N54752-0 N27737-0 N36288-0 N37204-0 N38779-0 N39741-0 N44698-0 N9621-0 N14478-0 N57081-0 N19661-0 N6837-0 N52867-0 N42515-0 N3841-0 N9284-0 N57818-0 N41979-0 N45523-0 N55322-0 N37338-0 N23089-0 N41934-0 N37994-0 N34185-1 N19050-0 N47652-0 N14638-0 N61233-0 N10960-0 N33828-0 N22975-0 N7494-0 N47612-0
156921,156922,U64209,11/11/2019 7:11:07 AM,,N35729-0 N15830-0 N48759-0 N21519-1 N27581-0 N18870-0 N56193-0 N62570-0 N32437-0 N57090-0 N55689-0 N65236-0 N33848-0 N61214-0 N13801-0 N38201-0 N53318-0 N6693-0 N50469-0 N4912-0 N63538-0 N49279-0 N54300-0 N35937-0


In [125]:
len(behavior_df)

156965

In [133]:
behavior_df[behavior_df['UserID'].map(behavior_df['UserID'].value_counts()) >1].sort_values(by = ['UserID','ImpressionID'])

Unnamed: 0,ImpressionID,UserID,Time,History,Impressions
46639,46640,U1000,11/14/2019 1:00:56 AM,N29641 N1789 N41244,N16148-0 N43628-0 N13094-0 N32774-0 N57034-0 N29739-1 N39399-0 N55949-0 N37027-0 N49712-0 N14436-0 N47346-0 N55555-0 N55976-0 N20793-0 N50601-0 N5146-0 N18522-0 N59138-0 N7670-1 N55488-0 N8015-0 N614-0 N33831-0 N25295-0 N55275-0 N43301-0 N36016-0 N40130-0 N48875-0 N41698-0 N56711-0 N45509-0 N22257-0 N7121-0 N11817-0 N41612-0 N39187-0 N16618-0 N59214-0 N60161-0 N55281-0 N10960-0 N8029-0 N43487-0 N35387-0 N9997-0 N39005-0 N42457-0 N64174-0 N63174-0 N16282-0 N58660-0 N44249-0 N1012-0 N6216-0 N15719-0 N20576-0 N16844-0 N3957-0 N50872-0 N36261-0 N20770-0 N41122-0 N7618-0 N10552-0 N29517-0 N61787-0 N53875-0 N37194-0 N64412-0 N30713-0 N17759-0 N20036-0 N19867-0 N23805-0 N12124-0 N3031-0 N3841-0 N50014-0 N10616-0 N62894-0 N5652-0 N4021-0 N8595-0 N21701-0 N26233-0 N5048-0 N56214-0 N3167-0 N23272-0 N54752-0
74344,74345,U1000,11/14/2019 10:37:21 PM,N29641 N1789 N41244,N14478-0 N58264-0 N7494-0 N46917-0 N22257-0 N11378-0 N2235-0 N28072-0 N3841-0 N64459-0 N47652-0 N30290-0 N41934-0 N50055-0 N31552-0 N22978-0 N8595-0 N61697-0 N29490-0 N57007-0 N64037-0 N6837-0 N55712-0 N27289-0 N39369-0 N48707-0 N2960-0 N21571-0 N7852-0 N31141-0 N39683-0 N57327-0 N48487-0 N12446-0 N26063-0 N47280-0 N10051-0 N50484-0 N55322-0 N60747-0 N61829-0 N22975-0 N58081-0 N54792-0 N63913-0 N54803-0 N42950-0 N56211-0 N23767-0 N398-0 N37233-0 N483-0 N61233-0 N27737-0 N58656-1 N34930-0 N42515-0
86766,86767,U1000,11/13/2019 11:16:18 PM,N29641 N1789 N41244,N35273-0 N7618-0 N55281-0 N4021-0 N16148-0 N3841-0 N54752-0 N61787-0 N1012-0 N57034-0 N55976-0 N59138-0 N27869-0 N21128-0 N5048-0 N28767-0 N48875-0 N49712-0 N4642-0 N16282-0 N18522-0 N13094-0 N7121-0 N37194-0 N33831-0 N36252-0 N14436-0 N64174-0 N41698-0 N53875-1 N8015-0 N23272-0 N35387-0 N16844-0 N55949-0 N39399-0 N3957-0 N32774-0 N36261-0 N3031-0 N42457-0 N37027-0 N5652-0 N47624-0 N41122-0 N55555-0 N9997-0 N51287-0 N40130-0 N16618-0 N14029-0 N56214-0 N58660-0 N62894-0 N38053-0 N8029-0
60202,60203,U10001,11/14/2019 5:46:54 AM,N47937 N51706 N56447 N61319 N27644 N18030 N27256 N33976 N34562 N2735 N20639 N22816 N56814 N46444 N59380,N42860-0 N62750-0 N33025-0 N3167-0 N51975-0 N40109-0 N7670-0 N25324-0 N39187-0 N58660-0 N5597-0 N43487-0 N32854-0 N50288-0 N7618-0 N11817-0 N56711-0 N36016-0 N19776-0 N3957-0 N7121-0 N26673-0 N50014-0 N8791-0 N9009-0 N19915-0 N58814-0 N1031-1 N46923-0 N16844-0 N54752-0 N30071-0 N16419-0 N35387-0 N46186-0 N16777-0 N38442-0 N63174-0 N59685-0 N10833-1 N30089-0 N64174-0 N10960-0 N5652-0 N54023-0 N50872-0 N17031-0 N55555-0 N60272-0 N18522-0 N42457-0 N31406-0 N5146-0 N9135-0 N14436-0 N29739-0 N17759-0 N45509-0 N47098-0 N51970-0 N27638-0 N3894-0 N46066-0 N6664-0 N60550-0 N27869-0 N38695-0 N16439-0 N8595-0 N20576-0 N43301-0 N8015-0 N11940-0 N59138-0 N48875-0 N37854-0
155922,155923,U10001,11/11/2019 5:30:21 AM,N47937 N51706 N56447 N61319 N27644 N18030 N27256 N33976 N34562 N2735 N20639 N22816 N56814 N46444 N59380,N35729-0 N56598-0 N48759-0 N49685-0 N33632-0 N33848-0 N35937-1 N27581-0
...,...,...,...,...,...
76920,76921,U9996,11/13/2019 10:07:24 AM,N28296 N4719 N60340 N31165 N8448,N287-1 N64094-0 N51048-0 N39010-0
120621,120622,U9996,11/14/2019 8:41:12 AM,N28296 N4719 N60340 N31165 N8448,N38779-0 N1539-0 N45523-0 N40109-0 N50872-0 N29212-0 N23877-0 N6477-0 N47098-1
154397,154398,U9996,11/14/2019 8:40:37 AM,N28296 N4719 N60340 N31165 N8448,N40109-0 N6477-0 N23877-0 N23446-1 N50872-0 N38779-0
62952,62953,U9997,11/13/2019 2:41:42 PM,N46759 N46990 N9072 N11727 N50744 N38367 N33333 N55285 N90 N64836 N64459 N18106 N11929,N35273-0 N4510-0 N48410-1 N29170-0 N26262-0 N57151-0 N7618-0 N8643-0 N56214-0 N22417-0 N35047-0 N14029-0 N36261-0 N51048-0 N36221-0 N38215-0 N17117-0 N37437-0 N47061-0 N13907-0 N8509-0


In [136]:
behavior_df = behavior_df.dropna(subset=['History'])

In [137]:
behavior_df['History'].isna().sum()

0

In [138]:
len(behavior_df)

153727

In [139]:
#split NewsID in history into separate rows :
split_history_df = behavior_df['History'].str.split(' ', expand=True).stack().reset_index(level=1, drop=True).reset_index(name='NewsID')

In [140]:
split_history_df.head()

Unnamed: 0,index,NewsID
0,0,N55189
1,0,N42782
2,0,N34694
3,0,N45794
4,0,N18445


In [144]:
expanded_df = behavior_df.merge(split_history_df, left_index=True, right_on='index').drop(columns=['index'])
expanded_df.head(10)


Unnamed: 0,ImpressionID,UserID,Time,History,Impressions,NewsID
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N10414 N19347 N31801,N55689-1 N35729-0,N55189
1,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N10414 N19347 N31801,N55689-1 N35729-0,N42782
2,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N10414 N19347 N31801,N55689-1 N35729-0,N34694
3,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N10414 N19347 N31801,N55689-1 N35729-0,N45794
4,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N10414 N19347 N31801,N55689-1 N35729-0,N18445
5,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N10414 N19347 N31801,N55689-1 N35729-0,N63302
6,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N10414 N19347 N31801,N55689-1 N35729-0,N10414
7,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N10414 N19347 N31801,N55689-1 N35729-0,N19347
8,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N10414 N19347 N31801,N55689-1 N35729-0,N31801
9,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129 N1569 N17686 N13008 N21623 N6233 N14340 N48031 N62285 N44383 N23061 N16290 N6244 N45099 N58715 N59049 N7023 N50528 N42704 N46082 N8275 N15710 N59026 N8429 N30867 N56514 N19709 N31402 N31741 N54889 N9798 N62612 N2663 N16617 N6087 N13231 N63317 N61388 N59359 N51163 N30698 N34567 N54225 N32852 N55833 N64467 N3142 N13912 N29802 N44462 N29948 N4486 N5398 N14761 N47020 N65112 N31699 N37159 N61101 N14761 N3433 N10438 N61355 N21164 N22976 N2511 N48390 N58224 N48742 N35458 N24611 N37509 N21773 N41011 N19041 N25785,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N22407-0 N14592-0 N17059-1 N33677-0 N7821-0 N6890-0,N31739


In [145]:
len(expanded_df)

5107639

In [150]:
expanded_df['History'].isna().sum()

0

In [152]:
df_merged = pd.merge(expanded_df, news_df, on='NewsID', how='left')
df_merged.head(10)

Unnamed: 0,ImpressionID,UserID,Time,History,Impressions,NewsID,Category,SubCategory,Title,Abstract
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N10414 N19347 N31801,N55689-1 N35729-0,N55189,tv,tvnews,"'Wheel Of Fortune' Guest Delivers Hilarious, Off The Rails Introduction","We'd like to solve the puzzle, Pat: Blair Davis' loveless marriage? On Monday, ""Wheel of Fortune"" welcomed as a new contestant trucking business owner Blair Davis, who offered a biting introduction for himself. When host Pat Sajak asked the man from Cardiff, California, about his family, Davis plunged into one of the darkest personal summaries the show has likely ever heard. ""I've been trapped in a loveless marriage for the last 12 years to an..."
1,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N10414 N19347 N31801,N55689-1 N35729-0,N42782,sports,baseball_mlb,Three takeaways from Yankees' ALCS Game 5 victory over the Astros,The Yankees kept hope alive thanks to some impressive starting pitching and a pair of early home runs.
2,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N10414 N19347 N31801,N55689-1 N35729-0,N34694,tv,tvnews,Rosie O'Donnell: Barbara Walters Isn't 'Up to Speaking to People' Right Now,Rosie O'Donnell: Barbara Walters Isn't 'Up to Speaking to People' Right Now
3,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N10414 N19347 N31801,N55689-1 N35729-0,N45794,news,newscrime,"Four flight attendants were arrested in Miami's airport after bringing in thousands in cash, police say","Four American Airlines flight attendants were arrested at the Miami International Airport and charged with money laundering after bringing large amounts of cash into the country, police said."
4,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N10414 N19347 N31801,N55689-1 N35729-0,N18445,sports,football_ncaa,Michigan sends breakup tweet to Notre Dame as series goes on hold,"Parting is such sweet sorrow, say the Wolverines."
5,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N10414 N19347 N31801,N55689-1 N35729-0,N63302,lifestyle,lifestylebuzz,This Wedding Photo of a Canine Best Man Captures Just How Deep a Dog's Love Truly Is,"When Mark Doublet made his dog, Marley, the best man at his wedding, he took his duties very seriously, and a photo of them together has gone viral."
6,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N10414 N19347 N31801,N55689-1 N35729-0,N10414,movies,movienews,"Robert Evans, 'Chinatown' Producer and Paramount Chief, Dies at 89","Robert Evans, the Paramount executive who produced ""Chinatown"" and ""Urban Cowboy"" and whose life became as melodramatic and jaw-dropping as any of his films, died on Saturday night. He was 89. Even though Hollywood history is filled with colorful characters, few can match the tale of Evans, whose life would seem far-fetched if it were"
7,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N10414 N19347 N31801,N55689-1 N35729-0,N19347,news,newspolitics,Former US Senator Kay Hagan dead at 66,"Former U.S. Sen. Kay Hagan, a one-time Capitol Hill intern who went on to become North Carolina's first Democratic female senator, died Monday at her Greensboro home. She was 66."
8,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N10414 N19347 N31801,N55689-1 N35729-0,N31801,news,newspolitics,Joe Biden reportedly denied Communion at a South Carolina church because of his stance on abortion,Joe Biden has a complicated history with the Catholic Church.
9,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129 N1569 N17686 N13008 N21623 N6233 N14340 N48031 N62285 N44383 N23061 N16290 N6244 N45099 N58715 N59049 N7023 N50528 N42704 N46082 N8275 N15710 N59026 N8429 N30867 N56514 N19709 N31402 N31741 N54889 N9798 N62612 N2663 N16617 N6087 N13231 N63317 N61388 N59359 N51163 N30698 N34567 N54225 N32852 N55833 N64467 N3142 N13912 N29802 N44462 N29948 N4486 N5398 N14761 N47020 N65112 N31699 N37159 N61101 N14761 N3433 N10438 N61355 N21164 N22976 N2511 N48390 N58224 N48742 N35458 N24611 N37509 N21773 N41011 N19041 N25785,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N22407-0 N14592-0 N17059-1 N33677-0 N7821-0 N6890-0,N31739,news,newscrime,"'Habitual stowaway' arrested again at O'Hare, Chicago police say","A woman arrested at O'Hare International Airport on Friday night was identified by Chicago police as ""known habitual stowaway"" Marilyn Hartman."


In [153]:
len(df_merged)

5107639

In [155]:
pd.set_option('display.max_colwidth', None)
df_concat = df_merged.groupby('ImpressionID')['Title'].apply(lambda x: ';'.join(x)).reset_index()
df_concat = df_concat.rename(columns={'Title': 'Titles'})
df_concat.head(10)

Unnamed: 0,ImpressionID,Titles
0,1,"'Wheel Of Fortune' Guest Delivers Hilarious, Off The Rails Introduction;Three takeaways from Yankees' ALCS Game 5 victory over the Astros;Rosie O'Donnell: Barbara Walters Isn't 'Up to Speaking to People' Right Now;Four flight attendants were arrested in Miami's airport after bringing in thousands in cash, police say;Michigan sends breakup tweet to Notre Dame as series goes on hold;This Wedding Photo of a Canine Best Man Captures Just How Deep a Dog's Love Truly Is;Robert Evans, 'Chinatown' Producer and Paramount Chief, Dies at 89;Former US Senator Kay Hagan dead at 66;Joe Biden reportedly denied Communion at a South Carolina church because of his stance on abortion"
1,2,"'Habitual stowaway' arrested again at O'Hare, Chicago police say;Search resumes for worker missing in New Orleans Hard Rock Hotel building site collapse that killed 2;Coroner IDs man found dead in Klondike neighborhood;China already snapping up U.S. farm goods, Trump claims;Motorcyclist killed in crash on New Cut Road identified;Wedding shooter may have been avenging late stepfather's murder, authorities say;Elderly Japan woman rescued from Typhoon Hagibis dies after falling from helicopter; storm kills at least 33;Officer who shot naked man found not guilty of murder;Suspect in Henry County murder, kidnapping arraigned;Harley-Davidson halts production of new electric motorcycles;A 'one in a million' yellow cardinal, dubbed 'Sunny,' seen in Florida;Boxer Patrick Day dies after suffering traumatic brain injury in super welterweight fight;Security camera spots SOS from woman lost for three days in Australia;Ex-Maverick to pay for funeral of woman fatally shot in home by police;Former Florida cop won't serve jail time after accidentally killing woman, 73, in training exercise;Florida police tell man to quit calling about stolen marijuana;Police arrest suspect in Bashford Manor double shooting;Missouri man missing for a week found alive in car that crashed into ravine;Suspect Alfredo Vazquez pleads guilty in Clark County judges' shooting case;Suspect in crossbow murder of his father arraigned;Officer placed on leave after threatening teen skaters at gunpoint;Three gunmen killed in Brazil airport raid, kidnapping: police;After being pushed close to extinction, this humpback whale population is making a comeback;Man shot in the chest on Floyd Street;UPDATE: Coroner reveals identity of victim in Beechmont neighborhood shooting;Archaeologists open ancient coffins in Egypt;30 killed in DR Congo bus accident;Arrest made in connection to stabbing in South Louisville;Cincinnati school board candidate accused of 'pretending to be black' in his ads;Group clears plastic drums, tires from Falls of the Ohio State Park;The size of a bus, rare and endangered whale shark spotted off Florida coast;Road built by biblical villain uncovered in Jerusalem;Teen given 8 years in prison for gang-related hate crime of student caught on tape;The Best Seafood Restaurant in All 50 States;Snoop Dogg bringing 'I Wanna Thank Me' tour to Louisville Palace;Chris, the famously woolly sheep, has died;Missing West Point cadet found dead, officials say;16-year-old injured in shooting near Churchill Downs;16 sentenced to death for burning alive Bangladesh teen;Coast Guard searches for man who fell from cruise off Texas;U.S. bans all flights to Cuba outside Havana in latest crackdown;Homeowner says a mystery object damaged his house. It didn't fall from a plane, FAA says;Police: 4 charged with attempted murder month after shooting in SW Louisville;Judge: Brad Pitt, others can be sued over New Orleans homes;Police Arrest Motorcyclist Who Led High-Speed Chase And Went Home Thinking He Escaped;He grew a 910-pound pumpkin and then used it as a boat;Cranes remove bus partially swallowed by Pittsburgh sinkhole;Grandfather charged in death of toddler who fell on cruise ship in Puerto Rico;Multiple Houses on Fire After Plane Crashes in NJ;Nearly $400M worth of cocaine and marijuana intercepted by US Coast Guard;Oldham County man charged, injuring victim during shooting practice;Former Fox News hosts want out of non-disclosure agreements;Teen wins science competition with liquid bandage invention;A lawyer reveals a legal nightmare you can face on a cruise ship;Barneys Is Sold for Scrap, Ending an Era;Louisville man who shot at LMPD officer sentenced to 10 more years in prison;A White Woman Searches for Her Black Family;Florida air show crash: Pilot killed when plane crashes;Suspects identified following hours-long SWAT standoff on South 34th Street;Niagara Falls: Boat stuck on rocks 101 years dislodged;The News In Cartoons;Family says 16-year-old was victim hit, killed on I-64;Woman stabbed in 2015 home invasion suing 911 dispatch company;National Sandwich Day: Where to score free subs and meal deals Sunday;Resident recalls moments leading up to fatal shooting in Russell neighborhood;Niagara Falls: Boat stuck on rocks 101 years dislodged;A secretive family landing a $4 billion haul has everyone guessing;Pope Lick Legend: Loved ones of those who died on trestle push for change;2 Murder suspects escape from Monterey County, California, Jail;Pirates kidnap nine crew members from cargo ship off coast of Benin;The Sante Fe High School massacre suspect isn't fit to stand trial and will get psychiatric treatment, attorney says;Krispy Kreme orders Minnesota student who bought, resold doughnuts to 'shut down operations';First round of auctions begin for Joe Ley Antiques;13-year-old charged with two murders escapes custody in North Carolina;Before his execution, a death row inmate told his victim's family he forgives them;Brazil oil auction a 'total disaster' as bidders stay away;Former Jewish Hospital nurse to be arraigned on sexual abuse of patient charge;A man dies in Hawaii after falling into a lava tube in his yard;Lumber spilled on I-71 following rollover crash; Lanes shut down;For the first time, a man has been sentenced for sexual slavery by the International Criminal Court. He got 30 years.;Kroger shooting: Suspect coming out of store fires at officers before they shoot him dead;OJ Simpson suing Vegas hotel for defamation over 2017 TMZ story claiming 'belligerence'"
2,3,"Couple Didn't Know Why Car Was Running Strangely Then They Popped The Hood;100 Outrageous Things Homeowners Have Done to Their Houses;Iowa family discovers horrifying basement filled with blood;Report: Three-time Super Bowl-winning RB LeGarrette Blount works out for Raiders;'The Masked Singer': Find Out Who the Eagle Was!;Wild fall freeze: Utah falls close to minus-35 as West shivers, while bitter cold oozes east;Beer-holding Nationals fan explained why he used his chest to block HR ball;The best sandwich shop in every US state and DC;Walmart's Pre-Holiday Deals Are Live Right Now;100 famous NFL players who played for teams you forgot about;A Los Angeles mansion once listed for $250 million has finally sold for 62% off after 3 years. Take a look inside what was once America's most expensive home.;The world's skinniest skyscraper is almost completed. I toured its first luxury condo take a look inside the NYC tower that's 24 times taller than it is wide.;What Tom Brady, Lamar Jackson Told Each Other After Patriots-Ravens;Man Pulled From Path Of Oncoming BART Train;Woman Finds A Giant Wild Snake Just Chilling In Her Bed;Here Are the Biggest Deals We're Anticipating for Black Friday"
3,4,"Former Deadliest Catch Star Jerod Sechrist Arrested, Charged with Heroin Possession;Off to the World Series, these Nationals have proved everyone wrong;Woman, suspect dead at 'Tarzan' actor Ron Ely's California residence;The megaprojects that changed countries forever;The world's largest nuclear power producer is melting down;Former NBA first-round pick Jim Farmer arrested in sex sting operation;Couple Cancels Wedding, Keeps $30K as ""Donation"" for Honeymoon Instead;Clippers set bad precedent resting Kawhi Leonard against Bucks;A sneak peek at Hudson Yards' Edge to open March 2020;7 biggest takeaways from the opening College Football Playoff rankings"
4,5,The Difference Between Green and Orange Antifreeze;Road built by biblical villain uncovered in Jerusalem;Boat inches closer to Niagara Falls edge after being grounded for century;24 Ways to Shrink Your Belly in 24 Hours
5,6,"Eliud Kipchoge runs 1:59 marathon, first to break 2 hours;Tens of thousands of Hong Kong protesters plead for U.S. help;Murder warrant for Fort Worth officer says woman he killed was holding gun in her home;Trooper pulls over to save flag on highway;Trump appears to confirm U.S. nukes are in Turkey, an admission that would break with longstanding protocol;Angels Hire Joe Maddon As New Manager;Cat Sees Rain For The First Time And Loses Her Mind;Giuliani Mixes His Business With Role as Trump's Lawyer;ALCS Game 6 Highlights: Yankees vs. Astros;Kate Upton Shares Rare Pic of Daughter Cheering Daddy On!;ALCS Game 6 Highlights: Yankees vs. Astros;Astros beat Yankees, will face Nationals in World Series;30 killed in DR Congo bus accident;Rachael Ray finally opens restaurant with no physical location;'He fought them,' father says of son killed on birthday defending his family in home;84-Year-Old Army Veteran Challenges TSA Agent to Push-Up Contest Before Her Flight;16 transgender celebrities who are changing television;Watch this Missouri bridge be demolished after 90 years;A man in Brazil accidentally blew up his yard while trying to kill some cockroaches;Does Tire Rotation Include Tightening Lug Nuts? Michigan Court Thinks About It For Awhile, Concludes 'No';Four American Airlines flight attendants arrested at Miami airport, accused of money laundering;Why did the Phillies hire new manager Joe Girardi?;Deer fatally attacks hunter who shot him;Drone video shows the devastated compound where al-Baghdadi died;Nationals navigating through the postseason using just six pitchers;New details emerge on raid that led to the death of ISIS leader;Father makes Halloween costumes for his son with cerebral palsy;Islamic State defector inside Baghdadi's hideout critical to success of raid, officials say;Nationals win first World Series title, storming back on Astros in Game 7, 6-2;ISIS names successor to slain leader al-Baghdadi;The News In Cartoons;Young trick-or-treater refills stranger's empty candy bowl with his own;Missing New Hampshire couple's bodies found buried on a Texas beach;The News In Cartoons;Pregnant Florida woman uses AR-15 to fatally shoot armed intruder;Britney Spears' Abs Are on Full Display In Her Newest Workout Video"
6,7,"Man charged with 5 counts of first-degree murder in shooting rampage at Chicago condo;The Holy Grail Of Jeep Grand Cherokees Sits On An Old Wisconsin Dairy Farm, But It May Be Doomed;Four charged with repeatedly escaping prison, returning with booze and phones;Report: Angels could be fined up to $2M for not reporting Skaggs' opioid use;Coast Guard offloads $92 million worth of smuggled cocaine in San Diego;Why is Giancarlo Stanton on the Yankees' playoff roster?;Rihanna Slays in Sexy Black Bikini in Slow-Motion Video -- Watch!;Scarlett Johansson Says She Was 'Surprised' by Colin Jost's Proposal: 'He Killed It';Illegal vapes traced to California woman who was CBD pioneer;30 Funny Dog Photos You Need to See;Joey 'the Clown' Lombardo, infamous Chicago mobster, dead at 90, reports say;Florida man arrested in beating death of peeping Tom;Burger King's new 'Ghost Whopper' is here to haunt your Halloween order;'Go back to work': Outcry over deaths on Amazon's warehouse floor;Rob Kardashian Appears Thinner at Kim Kardashian's Birthday Party in Rare Video;Can You Find the Ghost Hidden Among the Skeletons in This Tricky Brainteaser?;Video allegedly shows police officer roughing up 11-year-old girl;Soto, Nationals edge Astros in World Series opener;California sheriff's deputy shot dead, ride-along injured in 'active shooter incident,' officials say;16 transgender celebrities who are changing television;Man threatened school to distract from robbery, police say;We Hope the Ford Bronco Has Taken Notes on the Suzuki Jimny;Nationals still uncertain on Scherzer status after injection;Hyundai fuel cell semi concept makes the Tesla Semi look boring;How did the Nats respond to a controversial call? By leaving absolutely no doubt.;Kevin Spacey Won't Be Charged in Sexual Assault Case After Accuser Dies;How To Open a Can Without a Can Opener;Tent Spent: We Try Luxury Camping in a $100,000 Airstream;TSA Agents Stop Passenger With Loaded Gun Before Boarding Plane;Couple Accused of Injecting Childhood Friend With Meth and Filming Her As She Died;ISIS Leader Paid Rival for Protection but Was Betrayed by His Own;Hennessey Maximus Is a 1000-HP Jeep Gladiator from Hell;She fought for human rights on a tiny island that banned guns. Then she was shot dead.;Bears' Khalil Mack calls out Eagles C Jason Kelce's controversial tactics in offsides penalties;One of FBI's Most Wanted fugitives offers surrender"
7,8,UFC Tampa results: Jedrzejczyk dominates Waterson;Franklin upset players wore Sutherland T-shirts without permission;New Mexico DE Nahje Flowers dies at 21;Evander Kane sued by hotel for gambling debt built during playoffs
8,9,"Police warn against THC-laced Halloween candy;Serial stowaway arrested at Chicago airport for 2nd time;California governor signs measure banning 'lunch shaming';Uh-Oh! Carrie Ann Inaba Falls on 'Dancing With the Stars';Wisconsin jury awards $450,000 in Sandy Hook defamation case;Bill Macy, Bea Arthur's Husband on 'Maude,' Dies at 97;Hillary Clinton backs out of DC event also set to include Tulsi Gabbard, Kirstjen Nielsen: report;Rosie O'Donnell: Barbara Walters Isn't 'Up to Speaking to People' Right Now;Singer stands up to heckler who told her to take shirt off;Bed, Bath and Beyond Pulls 'Blackface Pumpkins From the Shelves;Coast Guard searches for man who fell from cruise off Texas;Woman stabbed to death by son in Plainville, police say;'Potentially historic wind event' over weekend could inflame California wildfires;17 Hallmark Movies That Are Totally Worth Watching;Tourist in Venice kidnaps local woman in row over exchange rate;Police: Off-Duty Dallas Officer Mistakenly Shoots, Injures Adult Son Thinking He Was An Intruder;Joe Biden reportedly denied Communion at a South Carolina church because of his stance on abortion;Former NBA first-round pick Jim Farmer arrested in sex sting operation;Niagara Falls: Boat stuck on rocks 101 years dislodged"
9,10,"Beloved grandfather's body pulled from collapsed Hard Rock hotel;Saddleridge fire calms down, heavy rains on East Coast;JetBlue Plane Makes Emergency Landing at JFK Airport;Iran alleges foreign government behind 'treacherous' ship attack;Why are most aircraft painted predominantly white?;Harley-Davidson halts production of new electric motorcycles;Cracks found on 38 of 810 Boeing 737 NG jets inspected globally;McConnell tees off on Democrats over impeachment;Nevada gambling bosses move to bar Wynn from casino industry;Health warning lifted over fire at California oil facility;'I'm 73 and fed up with California and want a gun-friendly, affordable city with good weather so where should I retire?';Southwest Airlines evacuates Chicago flight after blown tires: reports;23 Foods That Help Lower High Blood Pressure;25 Cars No One Wanted to Buy;Why it's so hard for planes to land on water;After Soldiers Surrender El Chapo's Son, a Shocked Mexican City Sighs With Relief;Four flight attendants were arrested in Miami's airport after bringing in thousands in cash, police say;Mitch McConnell snubbed by Elijah Cummings' pallbearer in handshake line at U.S. Capitol ceremony;Explosive new wildfire drives thousands from homes near Los Angeles;DOJ sues Walmart for allegedly discriminating against a Navy reserve officer;Firefighting Plane Battles Maria Fire Near Santa Paula;Illegal drones ground water-dropping helicopters at critical moment in Maria fire battle;Chinese pilot suspended after woman poses in cockpit with drinks;Inside the Sheikh of Dubai's 800-acre Kentucky horse farm;Keanu Reeves holds hands with Alexandra Grant as the Internet swoons, plus more news;Lucky Texas Storm Chaser Sees Hailstones Whizz Past Him During Tornado;Joe Alwyn on Buzz Around Taylor Swift Romance: 'I'll Just Turn it Off';40 Fun Facts About Jennifer Aniston"


In [156]:
len(df_concat)

153727

In [159]:
final_user_df = pd.merge(behavior_df , df_concat, on="ImpressionID", how='inner')
final_user_df.head()

Unnamed: 0,ImpressionID,UserID,Time,History,Impressions,Titles
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N10414 N19347 N31801,N55689-1 N35729-0,"'Wheel Of Fortune' Guest Delivers Hilarious, Off The Rails Introduction;Three takeaways from Yankees' ALCS Game 5 victory over the Astros;Rosie O'Donnell: Barbara Walters Isn't 'Up to Speaking to People' Right Now;Four flight attendants were arrested in Miami's airport after bringing in thousands in cash, police say;Michigan sends breakup tweet to Notre Dame as series goes on hold;This Wedding Photo of a Canine Best Man Captures Just How Deep a Dog's Love Truly Is;Robert Evans, 'Chinatown' Producer and Paramount Chief, Dies at 89;Former US Senator Kay Hagan dead at 66;Joe Biden reportedly denied Communion at a South Carolina church because of his stance on abortion"
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129 N1569 N17686 N13008 N21623 N6233 N14340 N48031 N62285 N44383 N23061 N16290 N6244 N45099 N58715 N59049 N7023 N50528 N42704 N46082 N8275 N15710 N59026 N8429 N30867 N56514 N19709 N31402 N31741 N54889 N9798 N62612 N2663 N16617 N6087 N13231 N63317 N61388 N59359 N51163 N30698 N34567 N54225 N32852 N55833 N64467 N3142 N13912 N29802 N44462 N29948 N4486 N5398 N14761 N47020 N65112 N31699 N37159 N61101 N14761 N3433 N10438 N61355 N21164 N22976 N2511 N48390 N58224 N48742 N35458 N24611 N37509 N21773 N41011 N19041 N25785,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N22407-0 N14592-0 N17059-1 N33677-0 N7821-0 N6890-0,"'Habitual stowaway' arrested again at O'Hare, Chicago police say;Search resumes for worker missing in New Orleans Hard Rock Hotel building site collapse that killed 2;Coroner IDs man found dead in Klondike neighborhood;China already snapping up U.S. farm goods, Trump claims;Motorcyclist killed in crash on New Cut Road identified;Wedding shooter may have been avenging late stepfather's murder, authorities say;Elderly Japan woman rescued from Typhoon Hagibis dies after falling from helicopter; storm kills at least 33;Officer who shot naked man found not guilty of murder;Suspect in Henry County murder, kidnapping arraigned;Harley-Davidson halts production of new electric motorcycles;A 'one in a million' yellow cardinal, dubbed 'Sunny,' seen in Florida;Boxer Patrick Day dies after suffering traumatic brain injury in super welterweight fight;Security camera spots SOS from woman lost for three days in Australia;Ex-Maverick to pay for funeral of woman fatally shot in home by police;Former Florida cop won't serve jail time after accidentally killing woman, 73, in training exercise;Florida police tell man to quit calling about stolen marijuana;Police arrest suspect in Bashford Manor double shooting;Missouri man missing for a week found alive in car that crashed into ravine;Suspect Alfredo Vazquez pleads guilty in Clark County judges' shooting case;Suspect in crossbow murder of his father arraigned;Officer placed on leave after threatening teen skaters at gunpoint;Three gunmen killed in Brazil airport raid, kidnapping: police;After being pushed close to extinction, this humpback whale population is making a comeback;Man shot in the chest on Floyd Street;UPDATE: Coroner reveals identity of victim in Beechmont neighborhood shooting;Archaeologists open ancient coffins in Egypt;30 killed in DR Congo bus accident;Arrest made in connection to stabbing in South Louisville;Cincinnati school board candidate accused of 'pretending to be black' in his ads;Group clears plastic drums, tires from Falls of the Ohio State Park;The size of a bus, rare and endangered whale shark spotted off Florida coast;Road built by biblical villain uncovered in Jerusalem;Teen given 8 years in prison for gang-related hate crime of student caught on tape;The Best Seafood Restaurant in All 50 States;Snoop Dogg bringing 'I Wanna Thank Me' tour to Louisville Palace;Chris, the famously woolly sheep, has died;Missing West Point cadet found dead, officials say;16-year-old injured in shooting near Churchill Downs;16 sentenced to death for burning alive Bangladesh teen;Coast Guard searches for man who fell from cruise off Texas;U.S. bans all flights to Cuba outside Havana in latest crackdown;Homeowner says a mystery object damaged his house. It didn't fall from a plane, FAA says;Police: 4 charged with attempted murder month after shooting in SW Louisville;Judge: Brad Pitt, others can be sued over New Orleans homes;Police Arrest Motorcyclist Who Led High-Speed Chase And Went Home Thinking He Escaped;He grew a 910-pound pumpkin and then used it as a boat;Cranes remove bus partially swallowed by Pittsburgh sinkhole;Grandfather charged in death of toddler who fell on cruise ship in Puerto Rico;Multiple Houses on Fire After Plane Crashes in NJ;Nearly $400M worth of cocaine and marijuana intercepted by US Coast Guard;Oldham County man charged, injuring victim during shooting practice;Former Fox News hosts want out of non-disclosure agreements;Teen wins science competition with liquid bandage invention;A lawyer reveals a legal nightmare you can face on a cruise ship;Barneys Is Sold for Scrap, Ending an Era;Louisville man who shot at LMPD officer sentenced to 10 more years in prison;A White Woman Searches for Her Black Family;Florida air show crash: Pilot killed when plane crashes;Suspects identified following hours-long SWAT standoff on South 34th Street;Niagara Falls: Boat stuck on rocks 101 years dislodged;The News In Cartoons;Family says 16-year-old was victim hit, killed on I-64;Woman stabbed in 2015 home invasion suing 911 dispatch company;National Sandwich Day: Where to score free subs and meal deals Sunday;Resident recalls moments leading up to fatal shooting in Russell neighborhood;Niagara Falls: Boat stuck on rocks 101 years dislodged;A secretive family landing a $4 billion haul has everyone guessing;Pope Lick Legend: Loved ones of those who died on trestle push for change;2 Murder suspects escape from Monterey County, California, Jail;Pirates kidnap nine crew members from cargo ship off coast of Benin;The Sante Fe High School massacre suspect isn't fit to stand trial and will get psychiatric treatment, attorney says;Krispy Kreme orders Minnesota student who bought, resold doughnuts to 'shut down operations';First round of auctions begin for Joe Ley Antiques;13-year-old charged with two murders escapes custody in North Carolina;Before his execution, a death row inmate told his victim's family he forgives them;Brazil oil auction a 'total disaster' as bidders stay away;Former Jewish Hospital nurse to be arraigned on sexual abuse of patient charge;A man dies in Hawaii after falling into a lava tube in his yard;Lumber spilled on I-71 following rollover crash; Lanes shut down;For the first time, a man has been sentenced for sexual slavery by the International Criminal Court. He got 30 years.;Kroger shooting: Suspect coming out of store fires at officers before they shoot him dead;OJ Simpson suing Vegas hotel for defamation over 2017 TMZ story claiming 'belligerence'"
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384 N46616 N52500 N33164 N47289 N24233 N62058 N26378 N49475 N18870,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N59685-0 N23814-1 N23446-0 N64174-0 N11817-0 N60550-0 N48225-0 N45509-0 N56711-0 N46821-0 N48017-0 N8015-0 N5364-0 N48722-0 N55555-0 N37348-0 N40109-0 N59495-0 N36226-0 N38779-0 N47346-0 N48875-0 N10960-0 N29739-0 N50872-0 N50592-0 N13131-0 N3839-0 N12330-0 N47098-0 N51570-0,"Couple Didn't Know Why Car Was Running Strangely Then They Popped The Hood;100 Outrageous Things Homeowners Have Done to Their Houses;Iowa family discovers horrifying basement filled with blood;Report: Three-time Super Bowl-winning RB LeGarrette Blount works out for Raiders;'The Masked Singer': Find Out Who the Eagle Was!;Wild fall freeze: Utah falls close to minus-35 as West shivers, while bitter cold oozes east;Beer-holding Nationals fan explained why he used his chest to block HR ball;The best sandwich shop in every US state and DC;Walmart's Pre-Holiday Deals Are Live Right Now;100 famous NFL players who played for teams you forgot about;A Los Angeles mansion once listed for $250 million has finally sold for 62% off after 3 years. Take a look inside what was once America's most expensive home.;The world's skinniest skyscraper is almost completed. I toured its first luxury condo take a look inside the NYC tower that's 24 times taller than it is wide.;What Tom Brady, Lamar Jackson Told Each Other After Patriots-Ravens;Man Pulled From Path Of Oncoming BART Train;Woman Finds A Giant Wild Snake Just Chilling In Her Bed;Here Are the Biggest Deals We're Anticipating for Black Friday"
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 N29757 N31825 N51891,N35729-0 N33632-0 N49685-1 N27581-0,"Former Deadliest Catch Star Jerod Sechrist Arrested, Charged with Heroin Possession;Off to the World Series, these Nationals have proved everyone wrong;Woman, suspect dead at 'Tarzan' actor Ron Ely's California residence;The megaprojects that changed countries forever;The world's largest nuclear power producer is melting down;Former NBA first-round pick Jim Farmer arrested in sex sting operation;Couple Cancels Wedding, Keeps $30K as ""Donation"" for Honeymoon Instead;Clippers set bad precedent resting Kawhi Leonard against Bucks;A sneak peek at Hudson Yards' Edge to open March 2020;7 biggest takeaways from the opening College Football Playoff rankings"
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N60408-0 N61497-0 N47412-0 N41220-0 N1940-0 N724-0 N11363-0 N261-0 N33883-0 N36807-0 N11967-0 N17896-0 N13486-0 N10413-0 N54274-0 N4247-0 N27497-0 N38512-0 N30253-0 N45389-0 N20015-0 N20678-0 N54003-0 N35850-0 N33261-0 N32010-0 N57426-0 N7419-0 N50023-0 N36446-0 N26940-0 N28495-0 N19318-0 N4936-0 N28414-0 N25108-0 N32791-0 N23563-0 N39317-0 N16166-0 N37058-0 N64851-0 N46992-0 N57327-0 N12995-0 N58363-0 N53084-0 N11094-0 N36436-0 N305-0 N58241-0 N33212-0 N6975-0 N58114-0 N3344-0 N25406-0 N4741-0 N33885-0 N20915-0 N44941-0 N57319-0 N36532-0 N61822-0 N20527-0,The Difference Between Green and Orange Antifreeze;Road built by biblical villain uncovered in Jerusalem;Boat inches closer to Niagara Falls edge after being grounded for century;24 Ways to Shrink Your Belly in 24 Hours


In [160]:
final_user_df['Titles'].isna().sum()

0

In [161]:
final_user_df.dtypes

ImpressionID     int64
UserID          object
Time            object
History         object
Impressions     object
Titles          object
dtype: object

In [162]:
final_user_df['Titles'] = final_df['Titles'].astype('string')

In [52]:
CHUNK_SIZE = 200
N_ROWS = 1000
prepped = []
progress_bar = tqdm(total= final_df.shape[0])

def chunk_df(df ,CHUNK_SIZE ):
    for i in range(0,len(df), CHUNK_SIZE): #for i in range(0,len(df), CHUNK_SIZE)
        yield df.iloc[i:i+CHUNK_SIZE]

chunk_num = 0
for chunk in chunk_df(final_df.head(N_ROWS) , CHUNK_SIZE):
    titles = chunk['Titles'].tolist()
    embeddings = get_embeddings(titles)
    prepped = [{'id':str(chunk_num*CHUNK_SIZE+i), 'values':embeddings[i],
                'metadata':{'UserID':chunk['UserID'].iloc[i]}} for i in range(0,len(titles))]
    chunk_num = chunk_num + 1
    if len(prepped) >= 200:
        try:
            index.upsert(prepped)
        except:
            print(f"Error upserting chunk {chunk_num}: {e}")
        prepped = []
    progress_bar.update(len(chunk))

#final
if prepped:
    try:
        index.upsert(prepped)
    except Exception as e:
        print(f"Error upserting final chunk: {e}")


  1%|          | 1000/156965 [01:55<5:01:30,  8.62it/s]
 98%|█████████▊| 153600/156965 [2:33:37<02:40, 20.92it/s] 

TypeError: 'NAType' object is not subscriptable