In [2]:
import math

import jieba
import matplotlib.pyplot as plt
import numpy
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from snownlp import SnowNLP

In [3]:
# The description results are stored as eight CSV shards (_part0 .. _part7).
# Read them in shard order and stack them into one frame with a fresh 0..n-1 index.
part_paths = [
    f'data/china/LLM-description/china/china_results._part{part}.csv'
    for part in range(8)
]
data = pd.concat([pd.read_csv(path) for path in part_paths], ignore_index=True)
print(f"length of data: {len(data)}")
data.head()

length of data: 63634


Unnamed: 0,用户ID,图片ID,实体,颜色,纹理,主题类型,情感
0,94459808@N00,3388967115,高山,绿色,岩石,自然风光,壮观的
1,94459808@N00,3393680646,悉尼歌剧院,白色,光滑,建筑,现代
2,94459808@N00,3389770958,高楼大厦,灰色,光滑,都市旅游,忙碌
3,62744044@N00,9120755474,大海,蓝色,波浪,自然,宁静
4,62744044@N00,9118604143,大海、游乐园、高山,蓝色、鲜艳的色彩,平滑、细腻,文化旅游、娱乐,欢乐、兴奋


In [4]:
# Discard every row that has a missing value in any column, then report what is left.
data = data.dropna(axis=0, how='any')
print(f"length of data after removing missing values: {data.shape[0]}")

length of data after removing missing values: 55519


In [5]:
# Sanity check only: count the rows that would remain after de-duplication.
# `data` itself is not modified here.
deduplicated = data.drop_duplicates()
print(f"length of data after removing duplicates: {len(deduplicated)}")

length of data after removing duplicates: 55519


In [6]:
# Drop the entity ('实体'), colour ('颜色') and texture ('纹理') columns, then save the
# remaining columns as a GBK-encoded CSV without the index column.
data = data.drop(['实体', '颜色', '纹理'], axis=1)
data.to_csv(
    'data/china/LLM-description/china/china_results.csv',
    encoding='gbk',
    index=False,
)

In [3]:
# Reload the combined results file; it is written with GBK encoding, so it must be read with it.
data = pd.read_csv(
    'data/china/LLM-description/china/china_results.csv',
    encoding='gbk',
)

In [4]:
# Base stopword set: one entry per line of the UTF-8 stopword file.
with open('hit_stopwords.txt', encoding='utf-8') as stopword_file:
    stopwords = set(stopword_file.read().splitlines())
# Extend it with extra tokens that should never be picked as a tag
# (blank/empty strings, colons and a handful of generic words).
stopwords |= {' ', '', '的', '气氛', '气息', '主题', '类型', '和', ':', '：', '旅游', '传达', '出'}
def tokenize(text):
    """Return the first non-stopword jieba token of ``text``, cut to at most two characters.

    Parameters
    ----------
    text : str
        Text to segment with ``jieba.cut``.

    Returns
    -------
    str
        The first token that is not in the module-level ``stopwords`` set,
        truncated to its first two characters. ``''`` is returned when every
        token is a stopword, or when ``text`` is not a string (for example a
        NaN cell), so callers can filter such rows out instead of crashing.
    """
    # jieba can only segment text; treat anything else as "no tag".
    if not isinstance(text, str):
        return ''
    # Only the first surviving token is used, so stop segmenting as soon as it is found.
    for word in jieba.cut(text):
        if word not in stopwords:
            return word[:2]
    return ''
# Derive one short tag per row from the theme-type ('主题类型') column.
data['tag'] = data['主题类型'].map(tokenize)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\MARSHA~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.778 seconds.
Prefix dict has been built successfully.


In [5]:
def sentiment_score(text):
    """Return the SnowNLP ``sentiments`` value of ``text``, rounded to three decimals.

    Parameters
    ----------
    text : str
        Text handed to ``SnowNLP``.

    Returns
    -------
    float
        ``SnowNLP(text).sentiments`` rounded to 3 decimal places.
    """
    return round(SnowNLP(text).sentiments, 3)
# Score every entry of the emotion ('情感') column and store the result as 'emotion'.
data['emotion'] = data['情感'].map(sentiment_score)

In [6]:
photo_attraction = pd.read_csv('data/china/photo-attraction/photo_attraction_onehot_china.csv')
# Convert the wide one-hot table into long (photo_ID, attractionID) pairs: one output row
# for every cell equal to 1 outside the 'photo_ID' column.
# numpy.nonzero scans the mask in row-major order, so pairs are emitted photo by photo,
# columns left to right. Working on whole columns also keeps photo_ID in its original
# dtype, which row-wise iteration does not guarantee.
onehot = photo_attraction.drop(columns='photo_ID')
hit_rows, hit_cols = numpy.nonzero(onehot.to_numpy() == 1)
result_df = pd.DataFrame({
    'photo_ID': photo_attraction['photo_ID'].to_numpy()[hit_rows],
    'attractionID': onehot.columns.to_numpy()[hit_cols],
})

In [8]:
# Drop the raw theme-type ('主题类型') and emotion ('情感') text columns, then rename the
# photo-ID ('图片ID') and user-ID ('用户ID') columns to English names.
data = (
    data
    .drop(['主题类型', '情感'], axis=1)
    .rename(columns={'图片ID': 'photo_ID', '用户ID': 'user_ID'})
)

In [9]:
photo_attraction = pd.read_csv('data/china/photo-attraction/photo_attraction.csv')
print(len(data))
# Inner join on photo_ID: only photos present in both tables are kept.
data = data.merge(photo_attraction, on='photo_ID', how='inner')
# Keep only rows whose tag is not the empty string (tokenize returns '' when no token survives).
data = data[data['tag'].map(lambda tag: tag != '')]
# Same test on emotion.
# NOTE(review): emotion holds rounded floats, so this filter presumably never drops a row — confirm.
data = data[data['emotion'].map(lambda score: score != '')]
data.head()

41876


Unnamed: 0,user_ID,photo_ID,tag,emotion,attractionID
0,31389030@N07,5741605963,文化,0.778,hongkong_id0
1,27558220@N00,356951876,文化,0.889,beijing_id143
2,27568262@N00,324769617,文化,0.889,hongkong_id0
3,27568262@N00,327222903,都市,0.714,hongkong_id0
4,27568262@N00,366415126,自然,0.889,hongkong_id8


In [10]:
# Number of rows currently in `data`.
data.shape[0]

35757

In [None]:
# Write the feature table as UTF-8 CSV without the index column.
# The same path is written again by a later cell, after label encoding.
data.to_csv(
    'data/china/features.csv',
    encoding='utf-8',
    index=False,
)

In [11]:
# Fit ONE encoder on the union of the values found in 'tag' and 'emotion', then replace
# both columns by their integer codes, so the two columns share a single id space.
# NOTE(review): with a shared encoder the 'tag' codes start after the 'emotion' codes
# (the printed ranges below are 0..188 and 189..276); confirm that downstream per-field
# sizes computed as max + 1 are meant to include that gap.
le = LabelEncoder()
all_tokens = set(data['tag']) | set(data['emotion'])
le.fit(list(all_tokens))
for column in ('tag', 'emotion'):
    data[column] = le.transform(data[column])


In [12]:
# Save `data` to features.csv as UTF-8, omitting the index column.
data.to_csv(
    'data/china/features.csv',
    encoding='utf-8',
    index=False,
)

In [None]:
# Read the CSV at data/china/user-attraction/china.csv into `df`.
file_path = 'data/china/user-attraction/china.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [13]:
# Print the encoded id range of each column, one value per line:
# tag max, tag min, emotion min, emotion max.
tag_codes, emotion_codes = data['tag'], data['emotion']
for bound in (tag_codes.max(), tag_codes.min(), emotion_codes.min(), emotion_codes.max()):
    print(bound)

276
189
0
188


In [23]:
# Size of each categorical field = largest encoded id in the column + 1,
# kept in the order (emotion, tag).
fields = (data[['emotion', 'tag']].max() + 1).to_numpy()

In [24]:
# Display the per-field sizes (order: emotion, tag).
fields

array([189, 277])

In [25]:
# Start offset of each field: 0 for the first field, then the running total of the
# sizes of all preceding fields (the last cumulative sum is not needed).
np.concatenate(([0], np.cumsum(fields)[:-1])).astype(np.longlong)

array([  0, 189], dtype=int64)